Fix Clustering Bug.
This version fixes bugs in the two difference gap statistic and the standardized silhouette score index. It also fixes errors that appeared when determining the optimal number of clusters from nonmonotonic linkage matrices produced by centroid and median linkage.
dcajasn committed Jul 20, 2024
1 parent 2bf5e74 commit 0077b75
Showing 19 changed files with 3,905 additions and 3,786 deletions.
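For context on the bug being fixed: centroid and median linkage can produce nonmonotonic linkage matrices (a merge at a lower height than an earlier merge, also called an inversion), so cutting the dendrogram at successive heights does not visit every cluster count between 1 and N. A minimal sketch of the symptom, using scipy directly on invented random data (not part of the commit):

```python
import numpy as np
import scipy.cluster.hierarchy as hr
from scipy.spatial.distance import pdist

# Invented data, for illustration only.
rng = np.random.default_rng(0)
X = rng.standard_normal((20, 4))

# Centroid and median linkage can produce inversions (nonmonotonic merges);
# single, complete, average, and ward linkage are always monotonic.
Z_centroid = hr.linkage(pdist(X), method="centroid")
Z_ward = hr.linkage(pdist(X), method="ward")

print(hr.is_monotonic(Z_centroid))  # often False for centroid/median linkage
print(hr.is_monotonic(Z_ward))      # True
```

`hr.is_monotonic` is the same check the patch below adds before deciding how to label clusters.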
2 changes: 1 addition & 1 deletion README.md
@@ -262,7 +262,7 @@ If you use Riskfolio-Lib for published work, please use the following BibTeX ent
```
@misc{riskfolio,
author = {Dany Cajas},
-title = {Riskfolio-Lib (6.2.0)},
+title = {Riskfolio-Lib (6.2.1)},
year = {2024},
url = {https://github.com/dcajasn/Riskfolio-Lib},
}
55 changes: 34 additions & 21 deletions docs/source/biblio.bib
@@ -346,7 +346,6 @@ @inbook{Mansini1
publisher = {Springer},
isbn = {978-3-319-18481-4},
doi = {10.1007/978-3-319-18482-1_2}
-
}

@article{Mansini2,
@@ -389,11 +388,14 @@ @book{Lamport86
Year = 1986}

@book{Fabozzi,
-author = {J Fabozzi, Frank and Kolm, Petter and Pachamanova, Dessislava and Focardi, Sergio},
-year = {2007},
-month = {05},
-pages = {},
-title = {Robust Portfolio Optimization and Management}
+title={Robust Portfolio Optimization and Management},
+author={Fabozzi, Frank J and Kolm, Petter N and Pachamanova, Dessislava A and Focardi, Sergio M},
+editor={Fabozzi, Frank J and Kolm, Petter N and Pachamanova, Dessislava A and Focardi, Sergio M},
+publisher={John Wiley \& Sons},
+month={05},
+year={2007},
+address={Nashville, TN},
}

@book{Boyd,
@@ -405,11 +407,13 @@ @book{Boyd
year={2004}}

@book{MichaudBook,
-author = {Michaud, Richard and Michaud, Robert},
-year = {2008},
+title={Efficient Asset Management: A Practical Guide to Stock Portfolio Optimization and Asset Allocation},
+author={Michaud, Richard and Michaud, Robert},
+isbn={9780199887194},
+series={Financial Management Association Survey and Synthesis},
+year={2008},
month = {01},
-pages = {},
-title = {Efficient Asset Management: A Practical Guide to Stock Portfolio Optimization and Asset Allocation}
+publisher={Oxford University Press}
}

@book{Mansini,
@@ -477,7 +481,6 @@ @mastersthesis{Graaf
address = {Rapenburg 70, 2311 EZ Leiden, Netherlands},
month = 12,
url={https://www.universiteitleiden.nl/binaries/content/assets/science/mi/scripties/master/de_graaf_tanita_master.pdf},
-
}

@mastersthesis{Cajas,
@@ -1045,16 +1048,26 @@ @article{Ojeda2015
journal = {The American Mathematical Monthly}
}

-@inproceedings{Loan1992,
-title={Approximation with Kronecker Products},
-author={Charles Van Loan and Nikos Pitsianis},
-year={1992}
-}

-@article{Yang2019,
-title={Uncertainty Set Sizes, Sensitivity Analysis, in Robust Portfolio Optimization},
-author={Yang, Mingyu},
-year={2019}
+@Inbook{VanLoan1993,
+author={Van Loan, C. F. and Pitsianis, N.},
+editor={Moonen, Marc S. and Golub, Gene H. and De Moor, Bart L. R.},
+title={Approximation with Kronecker Products},
+bookTitle={Linear Algebra for Large Scale and Real-Time Applications},
+year={1993},
+publisher={Springer Netherlands},
+address={Dordrecht},
+pages={293--314},
+isbn={978-94-015-8196-7},
+doi={10.1007/978-94-015-8196-7_17},
+url={https://doi.org/10.1007/978-94-015-8196-7_17}
+}

+@mastersthesis{Yang2019,
+title={Uncertainty Set Sizes, Sensitivity Analysis, in Robust Portfolio Optimization},
+author={Yang, Mingyu},
+year={2019},
+school={University of Waterloo},
+url={https://www.math.uwaterloo.ca/~hwolkowi/henry/reports/MingyuYangCM-eresearchpaper-printcopy.pdf},
}

@article{Tutuncu2004,
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -25,7 +25,7 @@
copyright = '2020-2024, Dany Cajas'
author = 'Dany Cajas'

-__version__ = "6.2.0"
+__version__ = "6.2.1"

# The short X.Y version
version = '.'.join(__version__.split('.')[:2])
2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -310,7 +310,7 @@ If you use Riskfolio-Lib for published work, please use the following BibTeX ent

@misc{riskfolio,
author = {Dany Cajas},
-title = {Riskfolio-Lib (6.2.0)},
+title = {Riskfolio-Lib (6.2.1)},
year = {2024},
url = {https://github.com/dcajasn/Riskfolio-Lib},
}

1,138 changes: 571 additions & 567 deletions examples/Tutorial 27 - HERC with Equal Weights within Clusters (HERC2).ipynb

Large diffs are not rendered by default.

324 changes: 189 additions & 135 deletions examples/Tutorial 28 - Hierarchical Clustering and Networks.ipynb

Large diffs are not rendered by default.

1,042 changes: 521 additions & 521 deletions examples/Tutorial 30 - Nested Clustered Optimization (NCO).ipynb

Large diffs are not rendered by default.

143 changes: 97 additions & 46 deletions riskfolio/src/AuxFunctions.py
@@ -243,7 +243,7 @@ def cov_returns(cov, seed=0):

def block_vec_pq(A, p, q):
r"""
-Calculates block vectorization operator as shown in :cite:`d-Loan1992`
+Calculates block vectorization operator as shown in :cite:`d-VanLoan1993`
and :cite:`d-Ojeda2015`.
Parameters
@@ -674,7 +674,7 @@ def ltdi_matrix(X, alpha=0.05):
return mat


-def two_diff_gap_stat(dist, clusters, max_k=10):
+def two_diff_gap_stat(dist, clustering, max_k=10):
r"""
Calculate the optimal number of clusters based on the two difference gap
statistic :cite:`d-twogap`.
@@ -683,7 +683,7 @@ def two_diff_gap_stat(dist, clusters, max_k=10):
----------
dist : str, optional
A distance measure based on the codependence matrix.
-clusters : str, optional
+clustering : str, optional
The hierarchical clustering encoded as a linkage matrix, see `linkage <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html?highlight=linkage#scipy.cluster.hierarchy.linkage>`_ for more details.
max_k : int, optional
Max number of clusters used by the two difference gap statistic
@@ -699,44 +699,70 @@ def two_diff_gap_stat(dist, clusters, max_k=10):
ValueError when the value cannot be calculated.
"""
+flag = False
+# Check if linkage matrix is monotonic
+if hr.is_monotonic(clustering):
+    flag = True
# cluster levels over from 1 to N-1 clusters
-cluster_lvls = pd.DataFrame(hr.cut_tree(clusters), index=dist.columns)
-num_k = cluster_lvls.columns  # save column with number of clusters
+cluster_lvls = pd.DataFrame(hr.cut_tree(clustering), index=dist.columns)
+level_k = cluster_lvls.columns.tolist()
cluster_lvls = cluster_lvls.iloc[:, ::-1]  # reverse order to start with 1 cluster
-cluster_lvls.columns = num_k  # set columns to number of cluster
+cluster_lvls.columns = level_k
+# Fix for nonmonotonic linkage matrices
+if flag is False:
+    for i in cluster_lvls.columns:
+        unique_vals, indices = np.unique(cluster_lvls[i], return_inverse=True)
+        cluster_lvls[i] = indices
+    cluster_lvls = cluster_lvls.T.drop_duplicates().T
+    level_k = cluster_lvls.columns.tolist()
+cluster_k = cluster_lvls.nunique(axis=0).tolist()
W_list = []
+n = dist.shape[0]

# get within-cluster dissimilarity for each k
-for k in range(min(len(num_k), max_k)):
-    level = cluster_lvls.iloc[:, k]  # get k clusters
-    D_list = []  # within-cluster distance list
-
-    for i in range(np.max(level.unique()) + 1):
-        cluster = level.loc[level == i]
-        # Based on correlation distance
-        cluster_dist = dist.loc[cluster.index, cluster.index]  # get distance
-        cluster_pdist = squareform(cluster_dist, checks=False)
-        if cluster_pdist.shape[0] != 0:
-            D = np.nan_to_num(cluster_pdist.mean())
-            D_list.append(D)  # append to list
-
-    W_k = np.sum(D_list)
-    W_list.append(W_k)
+for k in cluster_k:
+    if k == 1:
+        W_list.append(-np.inf)
+    elif k > min(max_k, np.sqrt(n)) + 2:
+        break
+    else:
+        level = cluster_lvls[level_k[cluster_k.index(k)]]  # get k clusters
+        D_list = []  # within-cluster distance list
+
+        for i in range(np.max(level.unique()) + 1):
+            cluster = level.loc[level == i]
+            # Based on correlation distance
+            cluster_dist = dist.loc[cluster.index, cluster.index]  # get distance
+            cluster_pdist = squareform(cluster_dist, checks=False)
+            if cluster_pdist.shape[0] != 0:
+                D = np.nan_to_num(cluster_pdist.std())
+                D_list.append(D)  # append to list
+
+        W_k = np.sum(D_list)
+        W_list.append(W_k)

W_list = pd.Series(W_list)
-n = dist.shape[0]
-limit_k = int(min(max_k, np.sqrt(n)))
-gaps = W_list.shift(2) + W_list - 2 * W_list.shift(1)
-gaps = gaps[0:limit_k]
-if gaps.isna().all():
-    k = len(gaps)
-else:
-    k = int(gaps.idxmax() + 2)
+gaps = W_list.shift(-2) + W_list - 2 * W_list.shift(-1)
+k_index = int(gaps.idxmax())
+k = cluster_k[k_index]
+node_k = level_k[k_index]
+
+if flag:
+    clustering_inds = cluster_lvls[node_k].tolist()
+else:
+    clustering_inds = hr.fcluster(clustering, k, criterion="maxclust")
+    j = len(np.unique(clustering_inds))
+    while k != j:
+        j += 1
+        clustering_inds = hr.fcluster(clustering, j, criterion="maxclust")
+        k = len(np.unique(clustering_inds))
+    unique_vals, indices = np.unique(clustering_inds, return_inverse=True)
+    clustering_inds = indices

-return k
+return k, clustering_inds
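A note on the nonmonotonic branch above: with `criterion="maxclust"`, `fcluster` returns at most the requested number of flat clusters, and on a linkage with inversions, asking for `k` can yield fewer than `k` distinct labels; the `while k != j` loop raises the request until exactly `k` clusters materialize. A small sketch of that behavior on invented data:

```python
import numpy as np
import scipy.cluster.hierarchy as hr
from scipy.spatial.distance import pdist

rng = np.random.default_rng(42)
X = rng.standard_normal((30, 3))
Z = hr.linkage(pdist(X), method="median")  # may contain inversions

# On a nonmonotonic linkage, requesting 5 clusters can produce
# fewer than 5 distinct labels, hence the while-loop in the fix.
labels = hr.fcluster(Z, 5, criterion="maxclust")
print(len(np.unique(labels)))  # can be < 5
```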


-def std_silhouette_score(dist, clusters, max_k=10):
+def std_silhouette_score(dist, clustering, max_k=10):
r"""
Calculate the optimal number of clusters based on the standardized silhouette
score index :cite:`d-Prado2`.
@@ -745,7 +771,7 @@ def std_silhouette_score(dist, clusters, max_k=10):
----------
dist : str, optional
A distance measure based on the codependence matrix.
-clusters : str, optional
+clustering : str, optional
The hierarchical clustering encoded as a linkage matrix, see `linkage <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html?highlight=linkage#scipy.cluster.hierarchy.linkage>`_ for more details.
max_k : int, optional
Max number of clusters used by the standardized silhouette score
@@ -761,29 +787,54 @@ def std_silhouette_score(dist, clusters, max_k=10):
ValueError when the value cannot be calculated.
"""
+flag = False
+# Check if linkage matrix is monotonic
+if hr.is_monotonic(clustering):
+    flag = True
# cluster levels over from 1 to N-1 clusters
-cluster_lvls = pd.DataFrame(hr.cut_tree(clusters), index=dist.columns)
-num_k = cluster_lvls.columns  # save column with number of clusters
+cluster_lvls = pd.DataFrame(hr.cut_tree(clustering), index=dist.columns)
+level_k = cluster_lvls.columns.tolist()
cluster_lvls = cluster_lvls.iloc[:, ::-1]  # reverse order to start with 1 cluster
-cluster_lvls.columns = num_k  # set columns to number of cluster
+cluster_lvls.columns = level_k
+# Fix for nonmonotonic linkage matrices
+if flag is False:
+    for i in cluster_lvls.columns:
+        unique_vals, indices = np.unique(cluster_lvls[i], return_inverse=True)
+        cluster_lvls[i] = indices
+    cluster_lvls = cluster_lvls.T.drop_duplicates().T
+    level_k = cluster_lvls.columns.tolist()
+cluster_k = cluster_lvls.nunique(axis=0).tolist()
scores_list = []
+n = dist.shape[0]

# get the silhouette score for each k
-for k in range(1, min(len(num_k) - 1, max_k)):
-    level = cluster_lvls.iloc[:, k]  # get k clusters
-    b = silhouette_samples(dist, level)
-    scores_list.append(b.mean() / b.std())
+for k in cluster_k:
+    if k == 1:
+        scores_list.append(-np.inf)
+    elif k > min(max_k, np.sqrt(n)):
+        break
+    else:
+        level = cluster_lvls[level_k[cluster_k.index(k)]]  # get k clusters
+        b = silhouette_samples(dist, level)
+        scores_list.append(b.mean() / b.std())

scores_list = pd.Series(scores_list)
-n = dist.shape[0]
-limit_k = int(min(max_k, np.sqrt(n)))
-scores_list = scores_list[0:limit_k]
-if scores_list.isna().all():
-    k = len(scores_list)
-else:
-    k = int(scores_list.idxmax() + 2)
+k_index = int(scores_list.idxmax())
+k = cluster_k[k_index]
+node_k = level_k[k_index]
+if flag:
+    clustering_inds = cluster_lvls[node_k].tolist()
+else:
+    clustering_inds = hr.fcluster(clustering, k, criterion="maxclust")
+    j = len(np.unique(clustering_inds))
+    while k != j:
+        j += 1
+        clustering_inds = hr.fcluster(clustering, j, criterion="maxclust")
+        k = len(np.unique(clustering_inds))
+    unique_vals, indices = np.unique(clustering_inds, return_inverse=True)
+    clustering_inds = indices

-return k
+return k, clustering_inds


def codep_dist(
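Both selectors now return a tuple `(k, clustering_inds)` instead of a bare `k`, and the gap is the second difference `W_list.shift(-2) + W_list - 2 * W_list.shift(-1)`, i.e. W(k+2) + W(k) - 2*W(k+1) evaluated per candidate k. A usage sketch against the new signature; the toy correlation matrix, asset names, and the module import path (aliased `af`, as in the repo's own code) are illustrative assumptions:

```python
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as hr
from scipy.spatial.distance import squareform
import riskfolio.src.AuxFunctions as af

# Toy correlation matrix for five assets (invented numbers).
corr = pd.DataFrame(
    [[1.0, 0.8, 0.1, 0.2, 0.1],
     [0.8, 1.0, 0.2, 0.1, 0.1],
     [0.1, 0.2, 1.0, 0.7, 0.6],
     [0.2, 0.1, 0.7, 1.0, 0.5],
     [0.1, 0.1, 0.6, 0.5, 1.0]],
    index=list("ABCDE"), columns=list("ABCDE"),
)
dist = np.sqrt(0.5 * (1 - corr))  # correlation distance, kept as a DataFrame

# Median linkage: one of the nonmonotonic cases this commit fixes.
clustering = hr.linkage(squareform(dist, checks=False), method="median")

k, clustering_inds = af.two_diff_gap_stat(dist, clustering, max_k=10)
print(k)                # optimal number of clusters
print(clustering_inds)  # one label per asset, aligned with dist.columns
```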
10 changes: 5 additions & 5 deletions riskfolio/src/ConstraintsFunctions.py
@@ -834,19 +834,19 @@ def assets_clusters(
# optimal number of clusters
if k is None:
    if opt_k_method == "twodiff":
-        k = af.two_diff_gap_stat(dist, clustering, max_k)
+        k, clustering_inds = af.two_diff_gap_stat(dist, clustering, max_k)
    elif opt_k_method == "stdsil":
-        k = af.std_silhouette_score(dist, clustering, max_k)
+        k, clustering_inds = af.std_silhouette_score(dist, clustering, max_k)
    else:
        raise ValueError("The only opt_k_method available are twodiff and stdsil")
+else:
+    clustering_inds = hr.fcluster(clustering, k, criterion="maxclust")

# Building clusters
-clusters_inds = hr.fcluster(clustering, k, criterion="maxclust")
labels = np.array(returns.columns.tolist())

clusters = {"Assets": [], "Clusters": []}

-for i, v in enumerate(clusters_inds):
+for i, v in enumerate(clustering_inds):
    clusters["Assets"].append(labels[i])
    clusters["Clusters"].append("Cluster " + str(v))
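Downstream, `assets_clusters` now takes its labels straight from the selector (or from a single `fcluster` call when `k` is given) instead of always re-cutting the tree. A usage sketch on invented returns; the public `rp.assets_clusters` entry point and its keyword parameters are assumed from the documented API, with `opt_k_method` as shown in the diff above:

```python
import numpy as np
import pandas as pd
import riskfolio as rp

# Invented daily returns for six assets, for demonstration only.
rng = np.random.default_rng(1)
returns = pd.DataFrame(
    rng.standard_normal((250, 6)) / 100,
    columns=[f"Asset{i}" for i in range(1, 7)],
)

# Before this fix, linkage="median" or "centroid" could fail while
# searching for the optimal number of clusters.
clusters = rp.assets_clusters(
    returns,
    codependence="pearson",
    linkage="median",
    opt_k_method="twodiff",
    max_k=10,
)
print(clusters)  # one "Cluster i" label per asset
```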