Fix Clustering Bug.
This version fixes bugs in the two difference gap statistic and the standardized silhouette score index. It also fixes errors that appeared when determining the optimal number of clusters from nonmonotonic linkage matrices produced by centroid and median linkage.
dcajasn committed Jul 20, 2024
1 parent 2bf5e74 commit 0077b75
Showing 19 changed files with 3,905 additions and 3,786 deletions.
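For context on the bug being fixed: centroid and median linkage can produce nonmonotonic linkage matrices (a merge at a lower height than an earlier merge, also called an inversion), so cutting the dendrogram at successive heights does not visit every cluster count between 1 and N. A minimal sketch of the symptom, using scipy directly on invented random data (not part of the commit):

```python
import numpy as np
import scipy.cluster.hierarchy as hr
from scipy.spatial.distance import pdist

# Invented data, for illustration only.
rng = np.random.default_rng(0)
X = rng.standard_normal((20, 4))

# Centroid and median linkage can produce inversions (nonmonotonic merges);
# single, complete, average, and ward linkage are always monotonic.
Z_centroid = hr.linkage(pdist(X), method="centroid")
Z_ward = hr.linkage(pdist(X), method="ward")

print(hr.is_monotonic(Z_centroid))  # often False for centroid/median linkage
print(hr.is_monotonic(Z_ward))      # True
```

`hr.is_monotonic` is the same check the patch below adds before deciding how to label clusters.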
2 changes: 1 addition & 1 deletion README.md
@@ -262,7 +262,7 @@ If you use Riskfolio-Lib for published work, please use the following BibTeX ent
```
@misc{riskfolio,
author = {Dany Cajas},
-title = {Riskfolio-Lib (6.2.0)},
+title = {Riskfolio-Lib (6.2.1)},
year = {2024},
url = {https://github.com/dcajasn/Riskfolio-Lib},
}
55 changes: 34 additions & 21 deletions docs/source/biblio.bib
@@ -346,7 +346,6 @@ @inbook{Mansini1
publisher = {Springer},
isbn = {978-3-319-18481-4},
doi = {10.1007/978-3-319-18482-1_2}
-
}

@article{Mansini2,
@@ -389,11 +388,14 @@ @book{Lamport86
Year = 1986}

@book{Fabozzi,
-author = {J Fabozzi, Frank and Kolm, Petter and Pachamanova, Dessislava and Focardi, Sergio},
-year = {2007},
-month = {05},
-pages = {},
-title = {Robust Portfolio Optimization and Management}
+title={Robust Portfolio Optimization and Management},
+author={Fabozzi, Frank J and Kolm, Petter N and Pachamanova, Dessislava A and Focardi, Sergio M},
+editor={Fabozzi, Frank J and Kolm, Petter N and Pachamanova, Dessislava A and Focardi, Sergio M},
+publisher={John Wiley \& Sons},
+month={05},
+year={2007},
+address={Nashville, TN},
}

@book{Boyd,
@@ -405,11 +407,13 @@ @book{Boyd
year={2004}}

@book{MichaudBook,
-author = {Michaud, Richard and Michaud, Robert},
-year = {2008},
+title={Efficient Asset Management: A Practical Guide to Stock Portfolio Optimization and Asset Allocation},
+author={Michaud, Richard and Michaud, Robert},
+isbn={9780199887194},
+series={Financial Management Association Survey and Synthesis},
+year={2008},
month = {01},
-pages = {},
-title = {Efficient Asset Management: A Practical Guide to Stock Portfolio Optimization and Asset Allocation}
+publisher={Oxford University Press}
}

@book{Mansini,
@@ -477,7 +481,6 @@ @mastersthesis{Graaf
address = {Rapenburg 70, 2311 EZ Leiden, Netherlands},
month = 12,
url={https://www.universiteitleiden.nl/binaries/content/assets/science/mi/scripties/master/de_graaf_tanita_master.pdf},
-
}

@mastersthesis{Cajas,
@@ -1045,16 +1048,26 @@ @article{Ojeda2015
journal = {The American Mathematical Monthly}
}

-@inproceedings{Loan1992,
-title={Approximation with Kronecker Products},
-author={Charles Van Loan and Nikos Pitsianis},
-year={1992}
-}

-@article{Yang2019,
-title={Uncertainty Set Sizes, Sensitivity Analysis, in Robust Portfolio Optimization},
-author={Yang, Mingyu},
-year={2019}
+@Inbook{VanLoan1993,
+author={Van Loan, C. F. and Pitsianis, N.},
+editor={Moonen, Marc S. and Golub, Gene H. and De Moor, Bart L. R.},
+title={Approximation with Kronecker Products},
+bookTitle={Linear Algebra for Large Scale and Real-Time Applications},
+year={1993},
+publisher={Springer Netherlands},
+address={Dordrecht},
+pages={293--314},
+isbn={978-94-015-8196-7},
+doi={10.1007/978-94-015-8196-7_17},
+url={https://doi.org/10.1007/978-94-015-8196-7_17}
+}

+@mastersthesis{Yang2019,
+title={Uncertainty Set Sizes, Sensitivity Analysis, in Robust Portfolio Optimization},
+author={Yang, Mingyu},
+year={2019},
+school={University of Waterloo},
+url={https://www.math.uwaterloo.ca/~hwolkowi/henry/reports/MingyuYangCM-eresearchpaper-printcopy.pdf},
}

@article{Tutuncu2004,
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -25,7 +25,7 @@
copyright = '2020-2024, Dany Cajas'
author = 'Dany Cajas'

-__version__ = "6.2.0"
+__version__ = "6.2.1"

# The short X.Y version
version = '.'.join(__version__.split('.')[:2])
2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -310,7 +310,7 @@ If you use Riskfolio-Lib for published work, please use the following BibTeX ent

@misc{riskfolio,
author = {Dany Cajas},
-title = {Riskfolio-Lib (6.2.0)},
+title = {Riskfolio-Lib (6.2.1)},
year = {2024},
url = {https://github.com/dcajasn/Riskfolio-Lib},
}

1,138 changes: 571 additions & 567 deletions examples/Tutorial 27 - HERC with Equal Weights within Clusters (HERC2).ipynb

Large diffs are not rendered by default.

324 changes: 189 additions & 135 deletions examples/Tutorial 28 - Hierarchical Clustering and Networks.ipynb

Large diffs are not rendered by default.

1,042 changes: 521 additions & 521 deletions examples/Tutorial 30 - Nested Clustered Optimization (NCO).ipynb

Large diffs are not rendered by default.

143 changes: 97 additions & 46 deletions riskfolio/src/AuxFunctions.py
@@ -243,7 +243,7 @@ def cov_returns(cov, seed=0):

def block_vec_pq(A, p, q):
r"""
-Calculates block vectorization operator as shown in :cite:`d-Loan1992`
+Calculates block vectorization operator as shown in :cite:`d-VanLoan1993`
and :cite:`d-Ojeda2015`.
Parameters
@@ -674,7 +674,7 @@ def ltdi_matrix(X, alpha=0.05):
return mat


-def two_diff_gap_stat(dist, clusters, max_k=10):
+def two_diff_gap_stat(dist, clustering, max_k=10):
r"""
Calculate the optimal number of clusters based on the two difference gap
statistic :cite:`d-twogap`.
@@ -683,7 +683,7 @@ def two_diff_gap_stat(dist, clusters, max_k=10):
----------
dist : str, optional
A distance measure based on the codependence matrix.
-clusters : str, optional
+clustering : str, optional
The hierarchical clustering encoded as a linkage matrix, see `linkage <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html?highlight=linkage#scipy.cluster.hierarchy.linkage>`_ for more details.
max_k : int, optional
Max number of clusters used by the two difference gap statistic
@@ -699,44 +699,70 @@ def two_diff_gap_stat(dist, clusters, max_k=10):
ValueError when the value cannot be calculated.
"""
+flag = False
+# Check if linkage matrix is monotonic
+if hr.is_monotonic(clustering):
+    flag = True
# cluster levels over from 1 to N-1 clusters
-cluster_lvls = pd.DataFrame(hr.cut_tree(clusters), index=dist.columns)
-num_k = cluster_lvls.columns  # save column with number of clusters
+cluster_lvls = pd.DataFrame(hr.cut_tree(clustering), index=dist.columns)
+level_k = cluster_lvls.columns.tolist()
cluster_lvls = cluster_lvls.iloc[:, ::-1]  # reverse order to start with 1 cluster
-cluster_lvls.columns = num_k  # set columns to number of cluster
+cluster_lvls.columns = level_k
+# Fix for nonmonotonic linkage matrices
+if flag is False:
+    for i in cluster_lvls.columns:
+        unique_vals, indices = np.unique(cluster_lvls[i], return_inverse=True)
+        cluster_lvls[i] = indices
+    cluster_lvls = cluster_lvls.T.drop_duplicates().T
+    level_k = cluster_lvls.columns.tolist()
+cluster_k = cluster_lvls.nunique(axis=0).tolist()
W_list = []
+n = dist.shape[0]

# get within-cluster dissimilarity for each k
-for k in range(min(len(num_k), max_k)):
-    level = cluster_lvls.iloc[:, k]  # get k clusters
-    D_list = []  # within-cluster distance list
-
-    for i in range(np.max(level.unique()) + 1):
-        cluster = level.loc[level == i]
-        # Based on correlation distance
-        cluster_dist = dist.loc[cluster.index, cluster.index]  # get distance
-        cluster_pdist = squareform(cluster_dist, checks=False)
-        if cluster_pdist.shape[0] != 0:
-            D = np.nan_to_num(cluster_pdist.mean())
-            D_list.append(D)  # append to list
-
-    W_k = np.sum(D_list)
-    W_list.append(W_k)
+for k in cluster_k:
+    if k == 1:
+        W_list.append(-np.inf)
+    elif k > min(max_k, np.sqrt(n)) + 2:
+        break
+    else:
+        level = cluster_lvls[level_k[cluster_k.index(k)]]  # get k clusters
+        D_list = []  # within-cluster distance list
+
+        for i in range(np.max(level.unique()) + 1):
+            cluster = level.loc[level == i]
+            # Based on correlation distance
+            cluster_dist = dist.loc[cluster.index, cluster.index]  # get distance
+            cluster_pdist = squareform(cluster_dist, checks=False)
+            if cluster_pdist.shape[0] != 0:
+                D = np.nan_to_num(cluster_pdist.std())
+                D_list.append(D)  # append to list
+
+        W_k = np.sum(D_list)
+        W_list.append(W_k)

W_list = pd.Series(W_list)
-n = dist.shape[0]
-limit_k = int(min(max_k, np.sqrt(n)))
-gaps = W_list.shift(2) + W_list - 2 * W_list.shift(1)
-gaps = gaps[0:limit_k]
-if gaps.isna().all():
-    k = len(gaps)
-else:
-    k = int(gaps.idxmax() + 2)
+gaps = W_list.shift(-2) + W_list - 2 * W_list.shift(-1)
+k_index = int(gaps.idxmax())
+k = cluster_k[k_index]
+node_k = level_k[k_index]
+
+if flag:
+    clustering_inds = cluster_lvls[node_k].tolist()
+else:
+    clustering_inds = hr.fcluster(clustering, k, criterion="maxclust")
+    j = len(np.unique(clustering_inds))
+    while k != j:
+        j += 1
+        clustering_inds = hr.fcluster(clustering, j, criterion="maxclust")
+        k = len(np.unique(clustering_inds))
+    unique_vals, indices = np.unique(clustering_inds, return_inverse=True)
+    clustering_inds = indices

-return k
+return k, clustering_inds
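A note on the nonmonotonic branch above: with `criterion="maxclust"`, `fcluster` returns at most the requested number of flat clusters, and on a linkage with inversions, asking for `k` can yield fewer than `k` distinct labels; the `while k != j` loop raises the request until exactly `k` clusters materialize. A small sketch of that behavior on invented data:

```python
import numpy as np
import scipy.cluster.hierarchy as hr
from scipy.spatial.distance import pdist

rng = np.random.default_rng(42)
X = rng.standard_normal((30, 3))
Z = hr.linkage(pdist(X), method="median")  # may contain inversions

# On a nonmonotonic linkage, requesting 5 clusters can produce
# fewer than 5 distinct labels, hence the while-loop in the fix.
labels = hr.fcluster(Z, 5, criterion="maxclust")
print(len(np.unique(labels)))  # can be < 5
```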


-def std_silhouette_score(dist, clusters, max_k=10):
+def std_silhouette_score(dist, clustering, max_k=10):
r"""
Calculate the optimal number of clusters based on the standardized silhouette
score index :cite:`d-Prado2`.
@@ -745,7 +771,7 @@ def std_silhouette_score(dist, clusters, max_k=10):
----------
dist : str, optional
A distance measure based on the codependence matrix.
-clusters : str, optional
+clustering : str, optional
The hierarchical clustering encoded as a linkage matrix, see `linkage <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html?highlight=linkage#scipy.cluster.hierarchy.linkage>`_ for more details.
max_k : int, optional
Max number of clusters used by the standardized silhouette score
@@ -761,29 +787,54 @@ def std_silhouette_score(dist, clusters, max_k=10):
ValueError when the value cannot be calculated.
"""
+flag = False
+# Check if linkage matrix is monotonic
+if hr.is_monotonic(clustering):
+    flag = True
# cluster levels over from 1 to N-1 clusters
-cluster_lvls = pd.DataFrame(hr.cut_tree(clusters), index=dist.columns)
-num_k = cluster_lvls.columns  # save column with number of clusters
+cluster_lvls = pd.DataFrame(hr.cut_tree(clustering), index=dist.columns)
+level_k = cluster_lvls.columns.tolist()
cluster_lvls = cluster_lvls.iloc[:, ::-1]  # reverse order to start with 1 cluster
-cluster_lvls.columns = num_k  # set columns to number of cluster
+cluster_lvls.columns = level_k
+# Fix for nonmonotonic linkage matrices
+if flag is False:
+    for i in cluster_lvls.columns:
+        unique_vals, indices = np.unique(cluster_lvls[i], return_inverse=True)
+        cluster_lvls[i] = indices
+    cluster_lvls = cluster_lvls.T.drop_duplicates().T
+    level_k = cluster_lvls.columns.tolist()
+cluster_k = cluster_lvls.nunique(axis=0).tolist()
scores_list = []
+n = dist.shape[0]

# get the silhouette score for each k
-for k in range(1, min(len(num_k) - 1, max_k)):
-    level = cluster_lvls.iloc[:, k]  # get k clusters
-    b = silhouette_samples(dist, level)
-    scores_list.append(b.mean() / b.std())
+for k in cluster_k:
+    if k == 1:
+        scores_list.append(-np.inf)
+    elif k > min(max_k, np.sqrt(n)):
+        break
+    else:
+        level = cluster_lvls[level_k[cluster_k.index(k)]]  # get k clusters
+        b = silhouette_samples(dist, level)
+        scores_list.append(b.mean() / b.std())

scores_list = pd.Series(scores_list)
-n = dist.shape[0]
-limit_k = int(min(max_k, np.sqrt(n)))
-scores_list = scores_list[0:limit_k]
-if scores_list.isna().all():
-    k = len(scores_list)
-else:
-    k = int(scores_list.idxmax() + 2)
+k_index = int(scores_list.idxmax())
+k = cluster_k[k_index]
+node_k = level_k[k_index]
+if flag:
+    clustering_inds = cluster_lvls[node_k].tolist()
+else:
+    clustering_inds = hr.fcluster(clustering, k, criterion="maxclust")
+    j = len(np.unique(clustering_inds))
+    while k != j:
+        j += 1
+        clustering_inds = hr.fcluster(clustering, j, criterion="maxclust")
+        k = len(np.unique(clustering_inds))
+    unique_vals, indices = np.unique(clustering_inds, return_inverse=True)
+    clustering_inds = indices

-return k
+return k, clustering_inds


def codep_dist(
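Both selectors now return a tuple `(k, clustering_inds)` instead of a bare `k`, and the gap is the second difference `W_list.shift(-2) + W_list - 2 * W_list.shift(-1)`, i.e. W(k+2) + W(k) - 2*W(k+1) evaluated per candidate k. A usage sketch against the new signature; the toy correlation matrix, asset names, and the module import path (aliased `af`, as in the repo's own code) are illustrative assumptions:

```python
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as hr
from scipy.spatial.distance import squareform
import riskfolio.src.AuxFunctions as af

# Toy correlation matrix for five assets (invented numbers).
corr = pd.DataFrame(
    [[1.0, 0.8, 0.1, 0.2, 0.1],
     [0.8, 1.0, 0.2, 0.1, 0.1],
     [0.1, 0.2, 1.0, 0.7, 0.6],
     [0.2, 0.1, 0.7, 1.0, 0.5],
     [0.1, 0.1, 0.6, 0.5, 1.0]],
    index=list("ABCDE"), columns=list("ABCDE"),
)
dist = np.sqrt(0.5 * (1 - corr))  # correlation distance, kept as a DataFrame

# Median linkage: one of the nonmonotonic cases this commit fixes.
clustering = hr.linkage(squareform(dist, checks=False), method="median")

k, clustering_inds = af.two_diff_gap_stat(dist, clustering, max_k=10)
print(k)                # optimal number of clusters
print(clustering_inds)  # one label per asset, aligned with dist.columns
```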
10 changes: 5 additions & 5 deletions riskfolio/src/ConstraintsFunctions.py
@@ -834,19 +834,19 @@ def assets_clusters(
# optimal number of clusters
if k is None:
    if opt_k_method == "twodiff":
-        k = af.two_diff_gap_stat(dist, clustering, max_k)
+        k, clustering_inds = af.two_diff_gap_stat(dist, clustering, max_k)
    elif opt_k_method == "stdsil":
-        k = af.std_silhouette_score(dist, clustering, max_k)
+        k, clustering_inds = af.std_silhouette_score(dist, clustering, max_k)
    else:
        raise ValueError("The only opt_k_method available are twodiff and stdsil")
+else:
+    clustering_inds = hr.fcluster(clustering, k, criterion="maxclust")

# Building clusters
-clusters_inds = hr.fcluster(clustering, k, criterion="maxclust")
labels = np.array(returns.columns.tolist())

clusters = {"Assets": [], "Clusters": []}

-for i, v in enumerate(clusters_inds):
+for i, v in enumerate(clustering_inds):
    clusters["Assets"].append(labels[i])
    clusters["Clusters"].append("Cluster " + str(v))
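Downstream, `assets_clusters` now takes its labels straight from the selector (or from a single `fcluster` call when `k` is given) instead of always re-cutting the tree. A usage sketch on invented returns; the public `rp.assets_clusters` entry point and its keyword parameters are assumed from the documented API, with `opt_k_method` as shown in the diff above:

```python
import numpy as np
import pandas as pd
import riskfolio as rp

# Invented daily returns for six assets, for demonstration only.
rng = np.random.default_rng(1)
returns = pd.DataFrame(
    rng.standard_normal((250, 6)) / 100,
    columns=[f"Asset{i}" for i in range(1, 7)],
)

# Before this fix, linkage="median" or "centroid" could fail while
# searching for the optimal number of clusters.
clusters = rp.assets_clusters(
    returns,
    codependence="pearson",
    linkage="median",
    opt_k_method="twodiff",
    max_k=10,
)
print(clusters)  # one "Cluster i" label per asset
```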