diff --git a/cdlib/__init__.py b/cdlib/__init__.py index 3533b526..f1ef144b 100644 --- a/cdlib/__init__.py +++ b/cdlib/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.3.1' +__version__ = "0.4.0" from cdlib.classes.node_clustering import NodeClustering from cdlib.classes.edge_clustering import EdgeClustering from cdlib.classes.fuzzy_node_clustering import FuzzyNodeClustering @@ -6,3 +6,4 @@ from cdlib.classes.bipartite_node_clustering import BiNodeClustering from cdlib.classes.temporal_clustering import TemporalClustering from cdlib.classes.named_clustering import NamedClustering +from cdlib.lifecycles import LifeCycle, CommunityEvent diff --git a/cdlib/algorithms/crisp_partition.py b/cdlib/algorithms/crisp_partition.py index d87fde73..19314416 100644 --- a/cdlib/algorithms/crisp_partition.py +++ b/cdlib/algorithms/crisp_partition.py @@ -517,13 +517,12 @@ def louvain( ========== ======== ======== :param g_original: a networkx/igraph object - :param partition : NodeClustering object, optional the algorithm will start using this partition of the nodes. - :param weight: str, optional the key in graph to use as weight. Default to 'weight' + :param partition: NodeClustering object, optional the algorithm will start using this partition of the nodes + :param weight: str, optional the key in graph to use as weight. Default to "weight" :param resolution: double, optional Will change the size of the communities, default to 1. - :param randomize: int, RandomState instance or None, optional (default=None). If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. + :param randomize: int, RandomState instance or None, optional (default=None). :return: NodeClustering object - :Example: >>> from cdlib import algorithms @@ -536,6 +535,7 @@ def louvain( Blondel, Vincent D., et al. 
`Fast unfolding of communities in large networks. `_ Journal of statistical mechanics: theory and experiment 2008.10 (2008): P10008. .. note:: Reference implementation: https://github.com/taynaud/python-louvain + """ g = convert_graph_formats(g_original, nx.Graph) @@ -2689,9 +2689,21 @@ def paris(g_original: object) -> NodeClustering: .. note:: Reference implementation: https://github.com/tbonald/paris """ + g = convert_graph_formats(g_original, nx.Graph) - D = paris_alg(g) - clustering = paris_best_clustering(D) + + dmap = {n: i for i, n in enumerate(g.nodes)} + reverse_map = {i: n for n, i in dmap.items()} + nx.relabel_nodes(g_original, dmap, False) + + D = paris_alg(g_original) + coms = paris_best_clustering(D) + + clustering = [] + + for com in coms: + com = [reverse_map[c] for c in com] + clustering.append(com) return NodeClustering( clustering, g_original, "Paris", method_parameters={}, overlap=False diff --git a/cdlib/algorithms/temporal_partition.py b/cdlib/algorithms/temporal_partition.py index a724264c..0bdf3ec2 100644 --- a/cdlib/algorithms/temporal_partition.py +++ b/cdlib/algorithms/temporal_partition.py @@ -1,5 +1,6 @@ from cdlib import TemporalClustering, NamedClustering from cdlib.algorithms.internal_dcd.eTILES import eTILES +import networkx as nx __all__ = ["tiles"] @@ -34,7 +35,7 @@ def tiles(dg: object, obs: int = 1) -> TemporalClustering: :References: - Rossetti, Giulio; Pappalardo, Luca; Pedreschi, Dino, and Giannotti, Fosca. `Tiles: an online algorithm for community discovery in dynamic social networks.`_ Machine Learning (2016), 106(8), 1213-1241. + Rossetti, Giulio; Pappalardo, Luca; Pedreschi, Dino, and Giannotti, Fosca. Tiles: an online algorithm for community discovery in dynamic social networks. Machine Learning (2016), 106(8), 1213-1241. 
""" alg = eTILES(dg=dg, obs=obs) tc = TemporalClustering() @@ -57,8 +58,10 @@ def tiles(dg: object, obs: int = 1) -> TemporalClustering: mtc = alg.get_matches() tc.add_matching(mtc) + ### polytree + # cleaning & updating community matching - dg = tc.lifecycle_polytree(None, False) + dg = __lifecycle_polytree(tc) community_ids = list(dg.nodes()) tids = tc.get_observation_ids() @@ -77,3 +80,22 @@ def tiles(dg: object, obs: int = 1) -> TemporalClustering: tc.add_matching(mtc) return tc + + +def __lifecycle_polytree(tc) -> nx.DiGraph: + """ + Reconstruct the poly-tree representing communities lifecycles using a provided similarity function. + """ + + lifecycle = tc.matching + + pt = nx.DiGraph() + if len(lifecycle[0]) == 3: + for u, v, w in lifecycle: + pt.add_edge(u, v, weight=w) + else: + # implicit matching + for u, v in lifecycle: + pt.add_edge(u, v) + + return pt diff --git a/cdlib/classes/node_clustering.py b/cdlib/classes/node_clustering.py index fd454a7d..badf25c7 100644 --- a/cdlib/classes/node_clustering.py +++ b/cdlib/classes/node_clustering.py @@ -704,9 +704,9 @@ def normalized_mutual_information( :Example: - >>> from cdlib.algorithms import louvain + >>> from cdlib import algorithms >>> g = nx.karate_club_graph() - >>> communities = louvain(g) + >>> communities = algorithms.louvain(g) >>> leiden_communities = algorithms.leiden(g) >>> mod = communities.normalized_mutual_information(leiden_communities) @@ -728,9 +728,9 @@ def overlapping_normalized_mutual_information_LFK( :Example: - >>> from cdlib.algorithms import louvain + >>> from cdlib import algorithms >>> g = nx.karate_club_graph() - >>> communities = louvain(g) + >>> communities = algorithms.louvain(g) >>> leiden_communities = algorithms.leiden(g) >>> mod = communities.overlapping_normalized_mutual_information_LFK(leiden_communities) @@ -782,9 +782,9 @@ def omega(self, clustering: Clustering) -> evaluation.MatchingResult: :Example: - >>> from cdlib.algorithms import louvain + >>> from cdlib import 
algorithms >>> g = nx.karate_club_graph() - >>> communities = louvain(g) + >>> communities = algorithms.louvain(g) >>> leiden_communities = algorithms.leiden(g) >>> mod = communities.omega(leiden_communities) @@ -805,9 +805,9 @@ def f1(self, clustering: Clustering) -> evaluation.MatchingResult: :Example: - >>> from cdlib.algorithms import louvain + >>> from cdlib import algorithms >>> g = nx.karate_club_graph() - >>> communities = louvain(g) + >>> communities = algorithms.louvain(g) >>> leiden_communities = algorithms.leiden(g) >>> mod = communities.f1(leiden_communities) @@ -828,9 +828,9 @@ def nf1(self, clustering: Clustering) -> evaluation.MatchingResult: :Example: - >>> from cdlib.algorithms import louvain + >>> from cdlib import algorithms >>> g = nx.karate_club_graph() - >>> communities = louvain(g) + >>> communities = algorithms.louvain(g) >>> leiden_communities = algorithms.leiden(g) >>> mod = communities.nf1(leiden_communities) @@ -871,9 +871,9 @@ def adjusted_rand_index(self, clustering: Clustering) -> evaluation.MatchingResu :Example: - >>> from cdlib.algorithms import louvain + >>> from cdlib import algorithms >>> g = nx.karate_club_graph() - >>> communities = louvain(g) + >>> communities = algorithms.louvain(g) >>> leiden_communities = algorithms.leiden(g) >>> mod = communities.adjusted_rand_index(leiden_communities) @@ -915,9 +915,9 @@ def adjusted_mutual_information( :Example: - >>> from cdlib.algorithms import louvain + >>> from cdlib import algorithms >>> g = nx.karate_club_graph() - >>> communities = louvain(g) + >>> communities = algorithms.louvain(g) >>> leiden_communities = algorithms.leiden(g) >>> mod = communities.adjusted_mutual_information(leiden_communities) @@ -942,9 +942,9 @@ def variation_of_information( :Example: - >>> from cdlib.algorithms import louvain + >>> from cdlib import algorithms >>> g = nx.karate_club_graph() - >>> communities = louvain(g) + >>> communities = algorithms.louvain(g) >>> leiden_communities = 
algorithms.leiden(g) >>> mod = communities.variation_of_information(leiden_communities) @@ -954,3 +954,522 @@ def variation_of_information( 1. Meila, M. (2007). **Comparing clusterings - an information based distance.** Journal of Multivariate Analysis, 98, 873-895. doi:10.1016/j.jmva.2006.11.013 """ return evaluation.variation_of_information(self, clustering) + + def partition_closeness_simple( + self, clustering: Clustering + ) -> evaluation.MatchingResult: + """Community size density closeness. + Simple implementation that does not leverage a kernel density estimator. + + $$ S_G(A,B) = \\frac{1}{2} \\sum_{i=1}^{r}\\sum_{j=1}^{s} \\min(\\frac{n^a(x^a_i)}{N^a}, \\frac{n^b(x^b_j)}{N^b}) \\delta(x_i^a,x_j^b) $$ + + where: + + $$ N^a $$ total number of communities in A of any size; + $$ x^a $$ ordered list of community sizes for A; + $$ n^a $$ multiplicity of community sizes for A. + + (symmetrically for B) + + :param clustering: NodeClustering object + :return: MatchingResult object + + :Example: + + >>> from cdlib import algorithms + >>> import networkx as nx + >>> g = nx.karate_club_graph() + >>> louvain_communities = algorithms.louvain(g) + >>> leiden_communities = algorithms.leiden(g) + >>> louvain_communities.partition_closeness_simple(leiden_communities) + + :Reference: + + 1. Dao, Vinh-Loc, Cécile Bothorel, and Philippe Lenca. "Estimating the similarity of community detection methods based on cluster size distribution." International Conference on Complex Networks and their Applications. Springer, Cham, 2018. + """ + return evaluation.partition_closeness_simple(self, clustering) + + def ecs( + self, + clustering: object, + alpha: float = 0.9, + r: float = 1.0, + r2: float = None, + rescale_path_type: str = "max", + ppr_implementation: str = "prpack", + ) -> evaluation.MatchingResult: + """ + The element-centric clustering similarity. + + :param clustering: NodeClustering object + :param alpha: The personalized page-rank return probability as a float in [0,1]. 
float, default 0.9 + :param r: The hierarchical scaling parameter for clustering1. float, default 1.0 + :param r2: The hierarchical scaling parameter for clustering2. float, default None + :param rescale_path_type: rescale the hierarchical height by: 'max' the maximum path from the root; 'min' the minimum path from the root; 'linkage' use the linkage distances in the clustering. + :param ppr_implementation: Choose an implementation for personalized page-rank calculation: 'prpack' use PPR algorithms in igraph; 'power_iteration': use power_iteration method. + :return: MatchingResult object + + :Example: + + >>> from cdlib import algorithms + >>> import networkx as nx + >>> g = nx.karate_club_graph() + >>> louvain_communities = algorithms.louvain(g) + >>> leiden_communities = algorithms.leiden(g) + >>> louvain_communities.ecs(leiden_communities) + + :Reference: + + A.J. Gates, I.B. Wood, W.P. Hetrick, and YY Ahn [2019]. "Element-centric clustering comparison unifies overlaps and hierarchy". Scientific Reports 9, 8574 + + .. note:: The function requires the clusim library to be installed. You can install it via pip: pip install clusim + """ + return evaluation.ecs( + self, + clustering, + alpha=alpha, + r=r, + r2=r2, + rescale_path_type=rescale_path_type, + ppr_implementation=ppr_implementation, + ) + + def jaccard_index( + self, + clustering: object, + ) -> evaluation.MatchingResult: + """ + This function calculates the Jaccard index between two clusterings. + + .. math:: J = \\frac{N11}{(N11+N10+N01)} + + + :param clustering: NodeClustering object + :return: MatchingResult object + + + :Example: + + >>> from cdlib import algorithms + >>> import networkx as nx + >>> g = nx.karate_club_graph() + >>> louvain_communities = algorithms.louvain(g) + >>> leiden_communities = algorithms.leiden(g) + >>> louvain_communities.jaccard_index(leiden_communities) + + :Reference: + + Paul Jaccard. The distribution of the flora in the alpine zone. New Phytologist, 11(2):37–50, 1912. 
+ + .. note:: The function requires the clusim library to be installed. You can install it via pip: pip install clusim + """ + return evaluation.jaccard_index(self, clustering) + + def rand_index( + self, + clustering: object, + ) -> evaluation.MatchingResult: + """ + This function calculates the Rand index between two clusterings. + + .. math:: RI = \\frac{(N11 + N00)}{(N11 + N10 + N01 + N00)} + + + :param clustering: NodeClustering object + :return: MatchingResult object + + :Example: + + >>> from cdlib import algorithms + >>> import networkx as nx + >>> g = nx.karate_club_graph() + >>> louvain_communities = algorithms.louvain(g) + >>> leiden_communities = algorithms.leiden(g) + >>> louvain_communities.rand_index(leiden_communities) + + :Reference: + + William M Rand. Objective Criteria for the Evaluation of Clustering Methods. Journal of the American Statistical Association, 66(336):846, 1971. + + + .. note:: The function requires the clusim library to be installed. You can install it via pip: pip install clusim + """ + return evaluation.rand_index(self, clustering) + + def fowlkes_mallows_index( + self, + clustering: object, + ) -> evaluation.MatchingResult: + """ + This function calculates the Fowlkes and Mallows index between two clusterings. + + .. math:: FM = \\frac{N11}{ \\sqrt{ (N11 + N10) * (N11 + N01) }} + + :param clustering: NodeClustering object + :return: MatchingResult object + + :Example: + + >>> from cdlib import algorithms + >>> import networkx as nx + >>> g = nx.karate_club_graph() + >>> louvain_communities = algorithms.louvain(g) + >>> leiden_communities = algorithms.leiden(g) + >>> louvain_communities.fowlkes_mallows_index(leiden_communities) + + :Reference: + + Edward B. Fowlkes and Colin L. Mallows. A method for comparing two hierarchical clusterings. Journal of the American Statistical Association, 78(383):553–569, 1983. + + + .. note:: The function requires the clusim library to be installed. 
You can install it via pip: pip install clusim + """ + return evaluation.fowlkes_mallows_index(self, clustering) + + def classification_error( + self, + clustering: object, + ) -> evaluation.MatchingResult: + """ + This function calculates the classification error between two clusterings. + + .. math:: CE = 1 - PI + + :param clustering: NodeClustering object + :return: MatchingResult object + + :Example: + + >>> from cdlib import algorithms + >>> import networkx as nx + >>> g = nx.karate_club_graph() + >>> louvain_communities = algorithms.louvain(g) + >>> leiden_communities = algorithms.leiden(g) + >>> louvain_communities.classification_error(leiden_communities) + + .. note:: The function requires the clusim library to be installed. You can install it via pip: pip install clusim + """ + return evaluation.classification_error(self, clustering) + + def czekanowski_index( + self, + clustering: object, + ) -> evaluation.MatchingResult: + """ + + This function calculates the Czekanowski index between two clusterings. + + Also known as: + Dice Symmetric index + Sorensen index + + .. math:: F = \\frac{2*N11}{(2*N11 + N10 + N01)} + + :param clustering: NodeClustering object + :return: MatchingResult object + + :Example: + + >>> from cdlib import algorithms + >>> import networkx as nx + >>> g = nx.karate_club_graph() + >>> louvain_communities = algorithms.louvain(g) + >>> leiden_communities = algorithms.leiden(g) + >>> louvain_communities.czekanowski_index(leiden_communities) + + .. note:: The function requires the clusim library to be installed. You can install it via pip: pip install clusim + """ + return evaluation.czekanowski_index(self, clustering) + + def dice_index( + self, + clustering: object, + ) -> evaluation.MatchingResult: + """ + This function calculates the Dice index between two clusterings. + + Also known as: + Czekanowski index + Sorensen index + + .. 
math:: F = \\frac{2*N11}{(2*N11 + N10 + N01)} + + + :param clustering: NodeClustering object + :return: MatchingResult object + + :Example: + + >>> from cdlib import algorithms + >>> import networkx as nx + >>> g = nx.karate_club_graph() + >>> louvain_communities = algorithms.louvain(g) + >>> leiden_communities = algorithms.leiden(g) + >>> louvain_communities.dice_index(leiden_communities) + + .. note:: The function requires the clusim library to be installed. You can install it via pip: pip install clusim + """ + + return evaluation.dice_index(self, clustering) + + def sorensen_index( + self, + clustering: object, + ) -> evaluation.MatchingResult: + """ + This function calculates the Sorensen index between two clusterings. + + Also known as: + Czekanowski index + Dice index + + .. math:: F = \\frac{2*N11}{(2*N11 + N10 + N01)} + + :param clustering: NodeClustering object + :return: MatchingResult object + + :Example: + + >>> from cdlib import algorithms + >>> import networkx as nx + >>> g = nx.karate_club_graph() + >>> louvain_communities = algorithms.louvain(g) + >>> leiden_communities = algorithms.leiden(g) + >>> louvain_communities.sorensen_index(leiden_communities) + + .. note:: The function requires the clusim library to be installed. You can install it via pip: pip install clusim + + """ + + return evaluation.sorensen_index(self, clustering) + + def rogers_tanimoto_index( + self, + clustering: object, + ) -> evaluation.MatchingResult: + """ + This function calculates the Rogers and Tanimoto index between two clusterings. + + .. math:: RT = \\frac{(N11 + N00)}{(N11 + 2*(N10+N01) + N00)} + + + :param clustering: NodeClustering object + :return: MatchingResult object + + :Example: + + >>> from cdlib import algorithms + >>> import networkx as nx + >>> g = nx.karate_club_graph() + >>> louvain_communities = algorithms.louvain(g) + >>> leiden_communities = algorithms.leiden(g) + >>> louvain_communities.rogers_tanimoto_index(leiden_communities) + + .. 
note:: The function requires the clusim library to be installed. You can install it via pip: pip install clusim + """ + return evaluation.rogers_tanimoto_index(self, clustering) + + def southwood_index( + self, + clustering: object, + ) -> evaluation.MatchingResult: + """ + This function calculates the Southwood index between two clusterings. + + .. math:: \\frac{N11}{(N10 + N01)} + + :param clustering: NodeClustering object + :return: MatchingResult object + + :Example: + + >>> from cdlib import algorithms + >>> import networkx as nx + >>> g = nx.karate_club_graph() + >>> louvain_communities = algorithms.louvain(g) + >>> leiden_communities = algorithms.leiden(g) + >>> louvain_communities.southwood_index(leiden_communities) + + .. note:: The function requires the clusim library to be installed. You can install it via pip: pip install clusim + """ + return evaluation.southwood_index(self, clustering) + + def mi( + self, + clustering: object, + ) -> evaluation.MatchingResult: + """ + This function calculates the Mutual Information (MI) between two clusterings. + + .. math:: MI = (S(c1) + S(c2) - S(c1, c2)) + + where S(c1) is the Shannon Entropy of the clustering size distribution and S(c1, c2) is the Shannon Entropy of the joint clustering size distribution. + + + :param clustering: NodeClustering object + :return: MatchingResult object + + :Example: + + >>> from cdlib import algorithms + >>> import networkx as nx + >>> g = nx.karate_club_graph() + >>> louvain_communities = algorithms.louvain(g) + >>> leiden_communities = algorithms.leiden(g) + >>> louvain_communities.mi(leiden_communities) + + :Reference: + + Leon Danon, Albert Díaz-Guilera, Jordi Duch, and Alex Arenas. Comparing community structure identification. Journal of Statistical Mechanics: Theory and Experiment, 2005(09):P09008–P09008, September 2005. + + .. note:: The function requires the clusim library to be installed. 
You can install it via pip: pip install clusim + """ + return evaluation.mi(self, clustering) + + def rmi( + self, + clustering: object, + norm_type: str = "none", + logbase: int = 2, + ) -> evaluation.MatchingResult: + """ + This function calculates the Reduced Mutual Information (RMI) between two clusterings. + + .. math:: RMI = MI(c1, c2) - \\log \\frac{Omega(a, b)}{n} + + where MI(c1, c2) is mutual information of the clusterings c1 and c2, and Omega(a, b) is the number of contingency tables with row and column sums equal to a and b. + + :param clustering: NodeClustering object + :param norm_type: The normalization types are: 'none' returns the RMI without a normalization; 'normalized' returns the RMI with upper bound equals to 1. + :param logbase: int, default 2 + :return: MatchingResult object + + :Example: + + >>> from cdlib import algorithms + >>> import networkx as nx + >>> g = nx.karate_club_graph() + >>> louvain_communities = algorithms.louvain(g) + >>> leiden_communities = algorithms.leiden(g) + >>> louvain_communities.rmi(leiden_communities) + + :Reference: + + M. E. J. Newman, George T. Cantwell, and Jean-Gabriel Young. Improved mutual information measure for classification and community detection. arXiv:1907.12581, 2019. + + .. note:: The function requires the clusim library to be installed. You can install it via pip: pip install clusim + """ + return evaluation.rmi(self, clustering, norm_type=norm_type, logbase=logbase) + + def geometric_accuracy( + self, + clustering: object, + ) -> evaluation.MatchingResult: + """ + This function calculates the geometric accuracy between two (overlapping) clusterings. 
+ + :param clustering: NodeClustering object + :return: MatchingResult object + + :Example: + + >>> from cdlib import algorithms + >>> import networkx as nx + >>> g = nx.karate_club_graph() + >>> louvain_communities = algorithms.louvain(g) + >>> leiden_communities = algorithms.leiden(g) + >>> louvain_communities.geometric_accuracy(leiden_communities) + + :Reference: + + Tamás Nepusz, Haiyuan Yu, and Alberto Paccanaro. Detecting overlapping protein complexes in protein-protein interaction networks. Nature Methods, 9(5):471–472, 2012. + + .. note:: The function requires the clusim library to be installed. You can install it via pip: pip install clusim + """ + return evaluation.geometric_accuracy(self, clustering) + + def overlap_quality( + self, + clustering: object, + ) -> evaluation.MatchingResult: + """ + This function calculates the overlap quality between two (overlapping) clusterings. + + :param clustering: NodeClustering object + :return: MatchingResult object + + :Example: + + >>> from cdlib import algorithms + >>> import networkx as nx + >>> g = nx.karate_club_graph() + >>> louvain_communities = algorithms.louvain(g) + >>> leiden_communities = algorithms.leiden(g) + >>> louvain_communities.overlap_quality(leiden_communities) + + :Reference: + + Yong-Yeol Ahn, James P Bagrow, and Sune Lehmann. Link communities reveal multiscale complexity in networks. Nature, 466(7307):761–764, June 2010. + + .. note:: The function requires the clusim library to be installed. You can install it via pip: pip install clusim + """ + return evaluation.overlap_quality(self, clustering) + + def sample_expected_sim( + self, + clustering: object, + measure: str = "jaccard_index", + random_model: str = "perm", + n_samples: int = 1, + keep_samples: bool = False, + ) -> evaluation.MatchingResult: + """ + This function calculates the expected Similarity for all pair-wise comparisons between Clusterings drawn from one of six random models. + + .. 
note:: Clustering 2 is considered the gold-standard clustering for one-sided expectations + + + :param clustering: NodeClustering object + :param measure: The similarity measure to evaluate. Must be one of [ecs, jaccard_index, rand_index, fowlkes_mallows_index, classification_error, czekanowski_index, dice_index, sorensen_index, rogers_tanimoto_index, southwood_index, mi, rmi, vi, geometric_accuracy, overlap_quality, sample_expected_sim] + :param random_model: The random model to use: + + 'all' : uniform distribution over the set of all clusterings of + n_elements + + 'all1' : one-sided selection from the uniform distribution over the set + of all clusterings of n_elements + + 'num' : uniform distribution over the set of all clusterings of + n_elements in n_clusters + + 'num1' : one-sided selection from the uniform distribution over the set + of all clusterings of n_elements in n_clusters + + 'perm' : the permutation model for a fixed cluster size sequence + + 'perm1' : one-sided selection from the permutation model for a fixed + cluster size sequence, same as 'perm' + + :param n_samples: The number of random Clusterings sampled to determine the expected similarity. + :param keep_samples: If True, returns the Similarity samples themselves, otherwise return their mean. + :return: MatchingResult object + + :Example: + + >>> from cdlib import algorithms + >>> import networkx as nx + >>> g = nx.karate_club_graph() + >>> louvain_communities = algorithms.louvain(g) + >>> leiden_communities = algorithms.leiden(g) + >>> louvain_communities.sample_expected_sim(leiden_communities) + + .. note:: The function requires the clusim library to be installed. 
You can install it via pip: pip install clusim + """ + return evaluation.sample_expected_sim( + self, + clustering, + measure=measure, + random_model=random_model, + n_samples=n_samples, + keep_samples=keep_samples, + ) diff --git a/cdlib/classes/temporal_clustering.py b/cdlib/classes/temporal_clustering.py index 61b2c7b7..09238f68 100644 --- a/cdlib/classes/temporal_clustering.py +++ b/cdlib/classes/temporal_clustering.py @@ -150,107 +150,3 @@ def get_explicit_community_match(self) -> list: cid is the position of the community within the Clustering object. """ return self.matching - - def community_matching( - self, method: Callable[[set, set], float], two_sided: bool = False - ) -> list: - """ - Reconstruct community matches across adjacent observations using a provided similarity function. - - :param method: a set similarity function with co-domain in [0,1] (e.g., Jaccard) - :param two_sided: boolean. - Whether the match has to be applied only from the past to the future (False, default) - or even from the future to the past (True) - :return: a list of tuples [(Ti_Ca, Tj_Cb, score), ... ]. - Community names are assigned following the pattern {tid}_{cid}, where tid is the time of observation and - cid is the position of the community within the Clustering object. 
- """ - - if self.matching is not None: - return self.matching - - lifecycle = [] - - for i in range(self.current_observation - 1): - c_i = self.clusterings[i] - c_j = self.clusterings[i + 1] - for name_i, com_i in c_i.named_communities.items(): - - # name_i = f"{self.obs_to_time[i]}_{cid_i}" - best_match = [] - best_score = 0 - - for name_j, com_j in c_j.named_communities.items(): - # name_j = f"{self.obs_to_time[i+1]}_{cid_j}" - - match = method(com_i, com_j) - if match > best_score: - best_match = [name_j] - best_score = match - elif match == best_score: - best_match.append(name_j) - - for j in best_match: - lifecycle.append((name_i, j, best_score)) - - if two_sided: - - for i in range(self.current_observation - 1, 0, -1): - c_i = self.clusterings[i] - c_j = self.clusterings[i - 1] - - for name_i, com_i in c_i.named_communities.items(): - # name_i = f"{self.obs_to_time[i]}_{cid_i}" - best_match = [] - best_score = 0 - - for name_j, com_j in c_j.named_communities.items(): - # name_j = f"{self.obs_to_time[i-1]}_{cid_j}" - - match = method(com_i, com_j) - if match > best_score: - best_match = [name_j] - best_score = match - elif match == best_score: - best_match.append(name_j) - - for j in best_match: - lifecycle.append((j, name_i, best_score)) - - self.matched = lifecycle - - return lifecycle - - def lifecycle_polytree( - self, method: Callable[[set, set], float] = None, two_sided: bool = False - ) -> nx.DiGraph: - """ - Reconstruct the poly-tree representing communities lifecycles using a provided similarity function. - - :param method: a set similarity function with co-domain in [0,1] (e.g., Jaccard) - :param two_sided: boolean. - Whether the match has to be applied only from the past to the future (False, default) - or even from the future to the past (True) - :return: a networkx DiGraph object. 
- Nodes represent communities, their ids are assigned following the pattern {tid}_{cid}, - where tid is the time of observation and - cid is the position of the community within the Clustering object. - """ - - if self.matching is not None: - lifecycle = self.matching - else: - if method is None: - raise ValueError("method parameter not specified") - lifecycle = self.community_matching(method, two_sided) - - pt = nx.DiGraph() - if len(lifecycle[0]) == 3: - for u, v, w in lifecycle: - pt.add_edge(u, v, weight=w) - else: - # implicit matching - for u, v in lifecycle: - pt.add_edge(u, v) - - return pt diff --git a/cdlib/lifecycles/__init__.py b/cdlib/lifecycles/__init__.py new file mode 100644 index 00000000..9f8aea09 --- /dev/null +++ b/cdlib/lifecycles/__init__.py @@ -0,0 +1,2 @@ +from .classes import LifeCycle, CommunityEvent +from .algorithms import * diff --git a/cdlib/lifecycles/algorithms/__init__.py b/cdlib/lifecycles/algorithms/__init__.py new file mode 100644 index 00000000..d9d5616a --- /dev/null +++ b/cdlib/lifecycles/algorithms/__init__.py @@ -0,0 +1,4 @@ +from .event_analysis import * +from .classic_match import * +from .measures import * +from .null_model import * diff --git a/cdlib/lifecycles/algorithms/classic_match.py b/cdlib/lifecycles/algorithms/classic_match.py new file mode 100644 index 00000000..d61f8796 --- /dev/null +++ b/cdlib/lifecycles/algorithms/classic_match.py @@ -0,0 +1,271 @@ +from itertools import combinations + +__all__ = ["events_asur", "event_graph_greene"] + + +def _asur_merge_score(t: set, R: list) -> float: + """ + Compute the asur Merge score. 
+ + defined as the ratio of the intersection of the target set and the union of the reference sets + over the size of the largest set (either the target or the union of the reference sets) + + :param t: target set + :param R: list of reference sets + :return: Merge score + """ + union_reference = set.union(*R) + nodes = union_reference.intersection(t) + res = len(nodes) / len(max([union_reference, t], key=len)) + + return res + + +def _greene_merge_score(t: set, R: set) -> float: + """ + Compute the Greene Merge score, + based on the Jaccard index. + + :param t: target set + :param R: reference set + :return: Merge score + """ + + return len(t.intersection(R)) / len(t.union(R)) + + +def _find_asur_merge_events(lc: object, th: float) -> tuple: + """ + Find Merge events in a lifecycle according to Asur et al. + + :param lc: the lifecycle object + :param th: cluster integrity threshold + :return: tuple (events, flows): the list of Merge events and the list of the related node flows + """ + events = [] + flows = [] + for t in lc.temporal_ids()[1:]: # start from the second time step + for set_name in lc.get_partition_at(t): + target = lc.get_group(set_name) + flow = lc.group_flow(set_name, "-") + r_names = list(flow.keys()) # names of the reference sets + # compute for all pairs of reference sets (combinations) + for r1, r2 in combinations(r_names, 2): + merge_score = _asur_merge_score( + target, + [lc.get_group(r1), lc.get_group(r2)], + ) + + if merge_score > th: + events.append( + { + "src": set_name, + "type": "Merge", + "score": merge_score, + "ref_sets": [r1, r2], # names of the reference sets + } + ) + + flows.append( + { + "src": set_name, + "type": "Merge", + "target": r1, + "flow": lc.get_group(r1).intersection( + lc.get_group(set_name) + ), + } + ) + flows.append( + { + "src": set_name, + "type": "Merge", + "target": r2, + "flow": lc.get_group(r2).intersection( + lc.get_group(set_name) + ), + } + ) + + return events, flows + + +def _find_asur_split_events(lc: object, th: float) -> tuple: + """ + Find Split events in a 
lifecycle according to Asur et al. + + :param lc: the lifecycle object + :param th: cluster integrity threshold + :return: tuple (events, flows): the list of Split events and the list of the related node flows (NOTE(review): the flows below are tagged 'Merge'; confirm whether 'Split' was intended) + """ + events, flows = [], [] + for t in lc.temporal_ids()[0:]: # iterate over every time step + for set_name in lc.get_partition_at(t): + target = lc.get_group(set_name) + flow = lc.group_flow(set_name, "+") + r_names = list(flow.keys()) # names of the reference sets + # compute for all pairs of reference sets (combinations) + for r1, r2 in combinations(r_names, 2): + merge_score = _asur_merge_score( + target, [lc.get_group(r1), lc.get_group(r2)] + ) + + if merge_score > th: + events.append( + { + "src": set_name, + "type": "Split", + "score": merge_score, + "ref_sets": [r1, r2], # names of the reference sets + } + ) + + flows.append( + { + "src": set_name, + "type": "Merge", + "target": r1, + "flow": lc.get_group(r1).intersection( + lc.get_group(set_name) + ), + } + ) + flows.append( + { + "src": set_name, + "type": "Merge", + "target": r2, + "flow": lc.get_group(r2).intersection( + lc.get_group(set_name) + ), + } + ) + + return events, flows + + +def _find_asur_birth_events(lc: object) -> list: + """ + Find Birth events in a lifecycle according to Asur et al. + + :param lc: the lifecycle object + :return: list of Birth events + """ + events = [] + for t in lc.temporal_ids()[1:]: # start from the second time step + for set_name in lc.get_partition_at(t): + flow = lc.group_flow(set_name, "-") + r_names = list(flow.keys()) # names of the reference sets + if len(r_names) == 0: + events.append({"src": set_name, "type": "Birth"}) + return events + + +def _find_asur_death_events(lc: object) -> list: + """ + Find Death events in a lifecycle according to Asur et al. 
def _find_asur_continue_events(lc: object) -> list:
    """
    Find Continuation events in a lifecycle according to Asur et al.

    A Continuation is recorded for a set whenever one of the reference sets
    returned by its forward ("+") flow contains exactly the same elements.

    :param lc: the lifecycle object
    :return: list of Continuation events
    """
    events = []
    for t in lc.temporal_ids()[:-1]:  # every time step but the last one
        for set_name in lc.get_partition_at(t):
            # the target does not change while scanning its reference sets
            target = lc.get_group(set_name)
            for name in lc.group_flow(set_name, "+"):
                if lc.get_group(name) == target:
                    events.append(
                        {
                            "src": set_name,
                            "type": "Continuation",
                            "ref_set": name,
                        }
                    )
    return events
def event_graph_greene(lc: object, th: float = 0.1) -> tuple:
    """
    Build the event graph of a lifecycle following Greene et al.

    Each set of every time step but the last one is scored, through
    _greene_merge_score, against the reference sets returned by its forward
    ("+") flow; the pairs scoring above the threshold are kept as matches,
    i.e., as edges of the event graph.

    :param lc: the lifecycle object
    :param th: threshold for the Jaccard index. Defaults to 0.1 according to best results in the original paper.
    :return: a tuple of two dictionaries, both keyed by "Merge": the list of
        matches and the list of the corresponding flows

    :Reference:
        Greene, D., Doyle, D., Cunningham, P.: Tracking the evolution of communities in dynamic social networks.
        In: Proceedings of the 2010 International Conference on Advances in Social Networks Analysis and Mining
        (ASONAM 2010), pp. 176–183. IEEE (2010)

    """
    matches, match_flows = [], []
    for tid in lc.temporal_ids()[:-1]:
        for src in lc.get_partition_at(tid):
            src_group = lc.get_group(src)
            # one candidate match per reference set reached by the flow
            for ref, shared in lc.group_flow(src, "+").items():
                if _greene_merge_score(src_group, lc.get_group(ref)) > th:
                    matches.append({"src": src, "type": "Merge", "ref_set": ref})
                    match_flows.append(
                        {"src": src, "type": "Merge", "target": ref, "flow": shared}
                    )

    return {"Merge": matches}, {"Merge": match_flows}
def event_weights_from_flow(analyzed_flows: dict, direction: str) -> dict:
    """
    Compute the event weights of the analyzed flows.

    :param analyzed_flows: the result of the analysis of a flow
    :param direction: the temporal direction in which the flow was analyzed
    :return: a dictionary keyed like analyzed_flows and valued by a dictionary
        mapping each event name to its weight
    :raises ValueError: if direction is neither "+" nor "-"
    """
    if direction not in ("+", "-"):
        raise ValueError("direction must be either '+' or '-'")
    names = backward_event_names() if direction == "-" else forward_event_names()
    # names and scores are paired positionally
    return {
        id_: dict(zip(names, _compute_event_scores(analyzed_flow)))
        for id_, analyzed_flow in analyzed_flows.items()
    }


def _compute_event_scores(analyzed_flow: dict) -> list:
    """
    Combine the Unicity, Identity and Outflow facets into eight event scores.

    Each score is the product of every facet or of its complement, so the
    eight scores cover all the combinations. The order is fixed: callers zip
    the result with a list of event names.

    :param analyzed_flow: a dictionary with the "Unicity", "Identity" and "Outflow" keys
    :return: the list of the eight event scores
    """
    u = analyzed_flow["Unicity"]
    i = analyzed_flow["Identity"]
    o = analyzed_flow["Outflow"]
    return [
        u * (1 - i) * o,
        (1 - u) * (1 - i) * o,
        u * i * o,
        (1 - u) * i * o,
        u * i * (1 - o),
        (1 - u) * i * (1 - o),
        u * (1 - i) * (1 - o),
        (1 - u) * (1 - i) * (1 - o),
    ]
def analyze_all_flows(
    lc: CommunityMatching, direction: str, min_branch_size: int = 1, attr=None
) -> dict:
    """
    Run analyze_flow (see it for details) on the sets of a LifeCycle object,
    w.r.t. a given temporal direction. The sets whose name starts with the id
    of the boundary time step (the last one for "+", the first one otherwise)
    are skipped.

    :param lc: a LifeCycle object
    :param direction: the temporal direction in which the sets are to be analyzed
    :param min_branch_size: the minimum number of elements that a branch must contain to be considered
    :param attr: the name or list of names of the attribute(s) to analyze. If None, no attribute is analyzed
    :return: a dictionary keyed by set name and valued by the result of analyze_flow
    """
    tids = lc.temporal_ids()
    boundary = str(tids[-1] if direction == "+" else tids[0])

    analysis = {}
    for name in lc.named_sets:
        if name.split("_")[0] == boundary:
            continue
        analysis[name] = analyze_flow(
            lc, name, direction, min_branch_size=min_branch_size, attr=attr
        )
    return analysis
def facets(lc: CommunityMatching, target: str, direction: str) -> dict:
    """
    Return the structural analysis of a target set of a lifecycle object:
    its Unicity, Identity and Outflow facets, together with its size.

    :param lc: a LifeCycle object
    :param target: the name of the target set
    :param direction: the temporal direction in which the flow is to be analyzed
    :return: a dictionary containing the facets
    """
    reference_sets = []
    for name in lc.group_flow(target, direction=direction):
        reference_sets.append(lc.get_group(name))
    return _analyze_one_struct(lc.get_group(target), reference_sets)
def _entropy(labels: list, base=2) -> float:
    """
    Shannon entropy of a list of labels.

    :param labels: the list of labels
    :param base: the base of the logarithm
    :return: the entropy of the label distribution
    """
    total = len(labels)
    frequencies = [occurrences / total for occurrences in Counter(labels).values()]
    return -sum(freq * log(freq, base) for freq in frequencies)


def _normalized_shannon_entropy(labels, base=2):
    """
    Shannon entropy of the labels divided by the maximum possible entropy,
    i.e., logb(n) where n is the number of distinct labels.

    :param labels: the list of labels
    :param base: the base of the logarithm (None selects the natural logarithm)
    :return: the normalized Shannon entropy
    :raises ZeroDivisionError: when all the labels are equal, since the
        maximum entropy is then zero
    """
    if base is None:
        base = e

    distinct_labels = len(set(labels))
    return _entropy(labels, base) / log(distinct_labels, base)
def stability(lc: object, direction: str) -> float:
    """
    compute the temporal partition stability.
    The stability is the average, over all the sets having events in the
    given direction, of the score of their continuation event.

    :param lc: the lifecycle object
    :param direction: the temporal direction
    :return: the stability score (0 when no event is available for the direction)

    """
    scores = ea.events_all(lc)[direction]
    if not scores:
        return 0

    total = 0
    for event in scores.values():
        # LifeCycle.compute_events names this event "Continuation"; the former
        # "Continue" spelling is still accepted in case the event names differ
        key = "Continuation" if "Continuation" in event else "Continue"
        total += event[key]
    return total / len(scores)
def flow_null(
    lc: object,
    target: str,
    direction: str,
    min_branch_size: int = 1,
    iterations: int = 1000,
) -> dict:
    """
    Compare the flow with a null model. Each branch of each flow is compared with a null branch of the same size.
    The null model is generated by randomly sampling elements from the reference partition *iterations* times.
    The mean and standard deviation of the null model are used to compute a z-score
    for each branch, which is then used to compute a p-value.

    :param lc: a CommunityMatching object
    :param target: target set identifier
    :param direction: temporal direction
    :param min_branch_size: minimum size of a branch to be considered
    :param iterations: number of random draws to be used to generate the null model
    :return: a dictionary keyed by set identifier and valued by mean, std, and p-value
    :raises ValueError: if direction is neither "+" nor "-"

    :Example:

    >>> from cdlib import TemporalClustering, LifeCycle
    >>> from cdlib import algorithms
    >>> from cdlib.lifecycles.algorithms import flow_null
    >>> from networkx.generators.community import LFR_benchmark_graph
    >>> tc = TemporalClustering()
    >>> for t in range(0, 10):
    >>>     g = LFR_benchmark_graph(
    >>>         n=250,
    >>>         tau1=3,
    >>>         tau2=1.5,
    >>>         mu=0.1,
    >>>         average_degree=5,
    >>>         min_community=20,
    >>>         seed=10,
    >>>     )
    >>>     coms = algorithms.louvain(g)  # here any CDlib algorithm can be applied
    >>>     tc.add_clustering(coms, t)
    >>> events = LifeCycle(tc)
    >>> events.compute_events("facets")
    >>> validated = flow_null(events, "0_2", "+")

    """

    flow = lc.group_flow(target, direction, min_branch_size)
    tid = int(target.split("_")[0])
    if direction == "+":
        tid += 1
    elif direction == "-":
        tid -= 1
    else:
        raise ValueError(f"Invalid direction: {direction}")
    # one list per reference set, repeating its id once per element
    reference = [[id_] * len(lc.get_group(id_)) for id_ in lc.get_partition_at(tid)]

    validated = dict()
    for name, subset in flow.items():
        # _null_model only has entries for the sets drawn at least once: a set
        # that was never drawn gets a null frequency of zero instead of
        # raising a KeyError
        null_model = _null_model(subset, reference, iterations).get(
            name, {"mean": 0, "std": 0}
        )
        # null mean, null std, p-value
        validated[name] = {
            "mean": null_model["mean"],
            "std": null_model["std"],
            "p-value": _p_value(len(subset), null_model),
        }
    return validated
+ + :param lc: a CommunityOMatching object + :param direction: temporal direction + :param min_branch_size: minimum size of a branch to be considered + :param iterations: number of random draws to be used to generate the null model + :return: a dictionary keyed by set identifier and valued by mean, std, and p-value + + :Example: + + >>> from cdlib import TemporalClustering, LifeCycle + >>> from cdlib import algorithms + >>> from cdlib.lifecycles.algorithms import all_flows_null + >>> from networkx.generators.community import LFR_benchmark_graph + >>> tc = TemporalClustering() + >>> for t in range(0, 10): + >>> g = LFR_benchmark_graph( + >>> n=250, + >>> tau1=3, + >>> tau2=1.5, + >>> mu=0.1, + >>> average_degree=5, + >>> min_community=20, + >>> seed=10, + >>> ) + >>> coms = algorithms.louvain(g) # here any CDlib algorithm can be applied + >>> tc.add_clustering(coms, t) + >>> events = LifeCycle(tc) + >>> events.compute_events("facets") + >>> validated = all_flows_null(events, "+") + + """ + validated = dict() + for target, flow in lc.all_flows(direction, min_branch_size).items(): + validated[target] = flow_null( + lc, target, direction, min_branch_size, iterations + ) + return validated diff --git a/cdlib/lifecycles/classes/__init__.py b/cdlib/lifecycles/classes/__init__.py new file mode 100644 index 00000000..dc7347ea --- /dev/null +++ b/cdlib/lifecycles/classes/__init__.py @@ -0,0 +1,2 @@ +from .event import * +from .matching import * diff --git a/cdlib/lifecycles/classes/event.py b/cdlib/lifecycles/classes/event.py new file mode 100644 index 00000000..d93c61e5 --- /dev/null +++ b/cdlib/lifecycles/classes/event.py @@ -0,0 +1,959 @@ +from cdlib.classes import TemporalClustering +from cdlib.lifecycles.classes.matching import CommunityMatching +from cdlib.lifecycles.algorithms.null_model import flow_null, all_flows_null +from cdlib.lifecycles.algorithms.event_analysis import ( + events_all, + analyze_all_flows, + analyze_flow, +) +from 
class CommunityEvent(object):
    """
    Holder of the events and of the flows attached to a single community.
    """

    def __init__(self, com_id):
        """
        Create a holder with no events and no flows.

        :param com_id: community id
        """
        self.com_id = com_id
        self.from_event, self.to_event = {}, {}
        self.in_flow, self.out_flow = {}, {}

    @staticmethod
    def _strictly_positive(events: dict) -> dict:
        # keep only the events having a strictly positive value
        kept = {}
        for name, value in events.items():
            if value > 0:
                kept[name] = value
        return kept

    def set_from_event(self, from_event: dict):
        """
        Store the from events having a strictly positive value.

        :param from_event: from event
        """
        self.from_event = self._strictly_positive(from_event)

    def set_to_event(self, to_event: dict):
        """
        Store the to events having a strictly positive value.

        :param to_event: to event
        """
        self.to_event = self._strictly_positive(to_event)

    def set_in_flow(self, in_flow: dict):
        """
        Store the in flow as it is.

        :param in_flow: in flow
        """
        self.in_flow = in_flow

    def set_out_flow(self, out_flow: dict):
        """
        Store the out flow as it is.

        :param out_flow: out flow
        """
        self.out_flow = out_flow

    def get_from_event(self) -> dict:
        """
        :return: the stored from events
        """
        return self.from_event

    def get_to_event(self) -> dict:
        """
        :return: the stored to events
        """
        return self.to_event

    def get_in_flow(self) -> dict:
        """
        :return: the stored in flow
        """
        return self.in_flow

    def get_out_flow(self) -> dict:
        """
        :return: the stored out flow
        """
        return self.out_flow

    def to_json(self) -> dict:
        """
        Describe the event as a dictionary in which every flow is turned into a list.

        :return: the event as a dictionary
        """

        def as_lists(flows):
            return {name: list(members) for name, members in flows.items()}

        return {
            "com_id": self.com_id,
            "from_event": self.from_event,
            "to_event": self.to_event,
            "in_flow": as_lists(self.in_flow),
            "out_flow": as_lists(self.out_flow),
        }
+ It allows to compute the events composing the lifecycle (leveraging different definitions) + and to analyze them starting from a TemporalClustering object. + """ + + def __init__(self, clustering: TemporalClustering = None): + """ + Constructor + + :param clustering: a TemporalClustering Object + + :Example: + + >>> from cdlib import TemporalClustering, LifeCycle + >>> from cdlib import algorithms + >>> from networkx.generators.community import LFR_benchmark_graph + >>> tc = TemporalClustering() + >>> for t in range(0, 10): + >>> g = LFR_benchmark_graph( + >>> n=250, + >>> tau1=3, + >>> tau2=1.5, + >>> mu=0.1, + >>> average_degree=5, + >>> min_community=20, + >>> seed=10, + >>> ) + >>> coms = algorithms.louvain(g) # here any CDlib algorithm can be applied + >>> tc.add_clustering(coms, t) + >>> events = LifeCycle(tc) + """ + self.clustering = clustering + self.events = {} + self.event_types = [] + self.cm = CommunityMatching() + if clustering is not None: + self.cm.set_temporal_clustering(self.clustering) + self.algo = None + + def compute_events_from_explicit_matching(self): + """ + Compute the events of the lifecycle using the explicit matching (if available) + + :Example: + + >>> from cdlib import TemporalClustering, LifeCycle + >>> from cdlib import algorithms + >>> from networkx.generators.community import LFR_benchmark_graph + >>> from dynetx import DynGraph + >>> dg = DynGraph() + >>> for t in range(0, 10): + >>> g = LFR_benchmark_graph( + >>> n=250, + >>> tau1=3, + >>> tau2=1.5, + >>> mu=0.1, + >>> average_degree=5, + >>> min_community=20, + >>> seed=10, + >>> ) + >>> dg.add_interactions_from(g, t) + >>> tc = algorithms.tiles(dg, 10) + >>> events = LifeCycle(tc) + >>> events.compute_events_from_explicit_matching() + """ + if not self.clustering.has_explicit_match(): + raise ValueError("No explicit matching available") + + lifecycle = self.clustering.get_explicit_community_match() + + flows = { + "+": defaultdict(lambda: defaultdict(set)), + "-": 
def compute_events_with_custom_matching(
    self,
    method: Callable[[set, set], float],
    two_sided: bool = True,
    threshold: float = 0.2,
):
    """
    Compute the events of the lifecycle using a custom matching similarity function

    Every community is matched with the best scoring communities (ties
    included) of the adjacent observation; the matches scoring above the
    threshold are turned into flows, from which the events are instantiated.

    :param method: a set similarity function with co-domain in [0,1] (e.g., Jaccard)
    :param two_sided: boolean.
        Whether the match has to be applied only from the past to the future (False)
        or even from the future to the past (True, default)
    :param threshold: the threshold above which two communities are considered matched

    :Example:

    >>> from cdlib import algorithms
    >>> from cdlib import TemporalClustering, LifeCycle
    >>> tc = TemporalClustering()
    >>> # build the temporal clustering object
    >>> evts = LifeCycle(tc)
    >>> jaccard = lambda x, y: len(set(x) & set(y)) / len(set(x) | set(y))
    >>> evts.compute_events_with_custom_matching(jaccard, two_sided=True, threshold=0.2)
    """

    def _best_matches(community, candidates):
        # names of the candidates maximising `method` (ties kept) and their score
        names, top = [], 0
        for cand_name, cand in candidates.items():
            score = method(community, cand)
            if score > top:
                names, top = [cand_name], score
            elif score == top:
                names.append(cand_name)
        return names, top

    self.event_types = ["Merge", "Split", "Continuation"]
    self.algo = "custom"
    lifecycle = []  # (earlier community, later community, score) tuples

    # past -> future: best successors of every community
    for i in range(self.clustering.current_observation - 1):
        c_i = self.clustering.clusterings[i]
        c_j = self.clustering.clusterings[i + 1]
        for name_i, com_i in c_i.named_communities.items():
            matched, score = _best_matches(com_i, c_j.named_communities)
            for j in matched:
                lifecycle.append((name_i, j, score))

    if two_sided:
        # future -> past: best predecessors of every community
        for i in range(self.clustering.current_observation - 1, 0, -1):
            c_i = self.clustering.clusterings[i]
            c_j = self.clustering.clusterings[i - 1]
            for name_i, com_i in c_i.named_communities.items():
                matched, score = _best_matches(com_i, c_j.named_communities)
                for j in matched:
                    lifecycle.append((j, name_i, score))

    flows = {
        "+": defaultdict(lambda: defaultdict(set)),
        "-": defaultdict(lambda: defaultdict(set)),
    }
    events = {
        "+": defaultdict(lambda: defaultdict(set)),
        "-": defaultdict(lambda: defaultdict(set)),
    }

    # NOTE(review): both passes above store the earlier community first, so,
    # presumably (names look like "<tid>_<cid>" with tid growing with the
    # observation index - TODO confirm), src_tid < dst_tid always holds: the
    # "-" flows stay empty and no "Merge" event can be produced. Confirm
    # whether the backward matches were meant to feed the "-" flows.
    for src, dst, score in lifecycle:
        src_tid = int(src.split("_")[0])
        dst_tid = int(dst.split("_")[0])
        if score > threshold:
            side = "+" if src_tid < dst_tid else "-"
            flows[side][src][dst] = set(
                self.clustering.get_community(src)
            ).intersection(set(self.clustering.get_community(dst)))

    self.__instantiate_events(flows, events)
def compute_events(
    self,
    matching_type: str = "facets",
    matching_params: dict = None,
):
    """
    Compute the events of the lifecycle

    :param matching_type: the type of matching algorithm to use. Options are "facets", "asur", "greene".
    :param matching_params: the parameters of the matching algorithm.
        Defaults to {"min_branch_size": 1, "threshold": 0.5}; keys missing from a
        user-supplied dictionary fall back to these defaults.
        The former parameter is required for "facets", the latter by "asur" and "greene".
    :raises ValueError: if matching_type is not one of the supported options

    :Example:

    >>> from cdlib import TemporalClustering, LifeCycle
    >>> from cdlib import algorithms
    >>> from networkx.generators.community import LFR_benchmark_graph
    >>> tc = TemporalClustering()
    >>> for t in range(0, 10):
    >>>     g = LFR_benchmark_graph(
    >>>         n=250,
    >>>         tau1=3,
    >>>         tau2=1.5,
    >>>         mu=0.1,
    >>>         average_degree=5,
    >>>         min_community=20,
    >>>         seed=10,
    >>>     )
    >>>     coms = algorithms.louvain(g)  # here any CDlib algorithm can be applied
    >>>     tc.add_clustering(coms, t)
    >>> events = LifeCycle(tc)
    >>> events.compute_events("facets")

    """
    # The effective parameters are rebuilt on every call: a dict literal used as
    # default argument would be shared (and mutable) across calls, and a partial
    # user dictionary would otherwise fail with a KeyError on the missing key.
    params = {"min_branch_size": 1, "threshold": 0.5}
    if matching_params is not None:
        params.update(matching_params)

    def _event_for(cid):
        # Return the CommunityEvent registered for cid, creating it on first use.
        if cid not in self.events:
            self.events[cid] = CommunityEvent(cid)
        return self.events[cid]

    def _positive(values):
        # Keep only the entries with a strictly positive value.
        return {k: v for k, v in values.items() if v > 0}

    def _count_events_by_direction(events):
        # Count, per source community, the event types observed towards a later
        # time id ("to") and towards an earlier or equal one ("from").
        to_evt = defaultdict(lambda: defaultdict(int))
        from_evt = defaultdict(lambda: defaultdict(int))
        for evts in events.values():
            for e in evts:
                src_tid = int(e["src"].split("_")[0])
                if "ref_sets" in e:
                    trg_tid = int(e["ref_sets"][0].split("_")[0])
                else:
                    trg_tid = int(e["ref_set"].split("_")[0])

                if src_tid < trg_tid:
                    to_evt[e["src"]][e["type"]] += 1
                else:
                    from_evt[e["src"]][e["type"]] += 1
        return to_evt, from_evt

    def _normalize(counts):
        # Turn per-community event counts into relative frequencies; the total
        # is computed once per community instead of once per event type.
        normalized = {}
        for cid, by_type in counts.items():
            total = sum(by_type.values())
            normalized[cid] = {t: c / total for t, c in by_type.items() if c > 0}
        return normalized

    if matching_type == "facets":

        self.algo = "facets"

        self.event_types = [
            "Birth",
            "Accumulation",
            "Growth",
            "Expansion",
            "Continuation",
            "Merge",
            "Offspring",
            "Reorganization",
            "Death",
            "Dispersion",
            "Shrink",
            "Reduction",
            "Continuation",
            "Split",
            "Ancestor",
            "Disassemble",
        ]

        min_branch_size = params["min_branch_size"]

        out_flows = self.cm.all_flows("+", min_branch_size=min_branch_size)
        for cid in out_flows:
            _event_for(cid).set_out_flow(out_flows[cid])

        in_flows = self.cm.all_flows("-", min_branch_size=min_branch_size)
        for cid in in_flows:
            _event_for(cid).set_in_flow(in_flows[cid])

        events = events_all(self.cm)
        from_events = events["-"]
        to_events = events["+"]

        for cid in from_events:
            self.events[cid].set_from_event(_positive(from_events[cid]))

        for cid in to_events:
            self.events[cid].set_to_event(_positive(to_events[cid]))

    elif matching_type == "asur":

        self.algo = "asur"

        self.event_types = ["Merge", "Split", "Continuation", "Birth", "Death"]

        events, flows = events_asur(self.cm, th=params["threshold"])

        c_to_evt, c_from_evt = _count_events_by_direction(events)
        c_from_evt = _normalize(c_from_evt)
        c_to_evt = _normalize(c_to_evt)

        c_from_flow = defaultdict(lambda: defaultdict(list))
        c_to_flow = defaultdict(lambda: defaultdict(list))

        for flow_list in flows.values():
            for e in flow_list:
                src_tid = int(e["src"].split("_")[0])
                trg_tid = int(e["target"].split("_")[0])

                if src_tid < trg_tid:
                    c_from_flow[e["src"]][e["target"]] = e["flow"]
                else:
                    c_to_flow[e["src"]][e["target"]] = e["flow"]

        for cid in c_to_flow:
            _event_for(cid).set_in_flow(c_to_flow[cid])

        for cid in c_from_flow:
            _event_for(cid).set_out_flow(c_from_flow[cid])

        for cid in c_to_evt:
            _event_for(cid).set_to_event(_positive(c_to_evt[cid]))

        for cid in c_from_evt:
            _event_for(cid).set_from_event(_positive(c_from_evt[cid]))

    elif matching_type == "greene":

        self.algo = "greene"

        self.event_types = ["Merge"]

        events, flow = event_graph_greene(self.cm, th=params["threshold"])
        c_to_evt, c_from_evt = _count_events_by_direction(events)

        for cid in flow:
            _event_for(cid).set_in_flow(flow[cid])

        for cid in c_to_evt:
            _event_for(cid).set_to_event(_positive(c_to_evt[cid]))

        for cid in c_from_evt:
            _event_for(cid).set_from_event(_positive(c_from_evt[cid]))

    else:
        raise ValueError(f"Unknown matching type {matching_type}")
def analyze_flows(
    self, direction: str = "+", min_branch_size: int = 1, attr=None
) -> dict:
    """
    Analyze every flow of the lifecycle along one temporal direction.

    :param direction: the temporal direction in which the flows are to be analyzed. Options are "+" and "-".
    :param min_branch_size: the minimum branch size
    :param attr: the attribute to analyze
    :return: the analyzed flows
    :raises ValueError: if no temporal clustering is set

    :Example:

    >>> from cdlib import TemporalClustering, LifeCycle
    >>> from cdlib import algorithms
    >>> from networkx.generators.community import LFR_benchmark_graph
    >>> tc = TemporalClustering()
    >>> for t in range(0, 10):
    >>>     g = LFR_benchmark_graph(
    >>>         n=250,
    >>>         tau1=3,
    >>>         tau2=1.5,
    >>>         mu=0.1,
    >>>         average_degree=5,
    >>>         min_community=20,
    >>>         seed=10,
    >>>     )
    >>>     coms = algorithms.louvain(g)  # here any CDlib algorithm can be applied
    >>>     tc.add_clustering(coms, t)
    >>> events = LifeCycle(tc)
    >>> events.compute_events("facets")
    >>> c = events.analyze_flows("+")

    """
    matching = self.cm
    if matching is None:
        raise ValueError("No temporal clustering set")
    return analyze_all_flows(matching, direction, min_branch_size, attr)
def set_attribute(self, attr: dict, name: str):
    """
    Attach a named attribute to the lifecycle.

    :param attr: the attributes
    :param name: the name of the attribute
    :raises ValueError: if no temporal clustering is set

    :Example:

    >>> from cdlib import TemporalClustering, LifeCycle
    >>> from cdlib import algorithms
    >>> import random
    >>> from networkx.generators.community import LFR_benchmark_graph
    >>>
    >>> def random_attributes():
    >>>     attrs = {}
    >>>     for i in range(250):
    >>>         attrs[i] = {}
    >>>         for t in range(10):
    >>>             attrs[i][t] = random.choice(["A", "B", "C", "D", "E"])
    >>>     return attrs
    >>>
    >>> tc = TemporalClustering()
    >>> for t in range(0, 10):
    >>>     g = LFR_benchmark_graph(
    >>>         n=250,
    >>>         tau1=3,
    >>>         tau2=1.5,
    >>>         mu=0.1,
    >>>         average_degree=5,
    >>>         min_community=20,
    >>>         seed=10,
    >>>     )
    >>>     coms = algorithms.louvain(g)  # here any CDlib algorithm can be applied
    >>>     tc.add_clustering(coms, t)
    >>> events = LifeCycle(tc)
    >>> events.compute_events("facets")
    >>> events.set_attribute(random_attributes(), "fakeattribute")

    """
    matching = self.cm
    if matching is None:
        raise ValueError("No temporal clustering set")
    # The values are stored on the underlying matching object.
    matching.set_attributes(attr, name)
def validate_flow(
    self,
    target: str,
    direction: str,
    min_branch_size: int = 1,
    iterations: int = 1000,
) -> dict:
    """
    Compare the flow with a null model. Each branch of each flow is compared with a null branch of the same size.
    The null model is generated by randomly sampling elements from the reference partition *iterations* times.
    The mean and standard deviation of the null model are used to compute a z-score
    for each branch, which is then used to compute a p-value.

    :param target: target set identifier
    :param direction: temporal direction, either "+" (out flow) or "-" (in flow)
    :param min_branch_size: minimum size of a branch to be considered
    :param iterations: number of random draws to be used to generate the null model
    :return: the outcome of the comparison with the null model, as computed by ``flow_null``
    :raises ValueError: if no temporal clustering is set

    :Example:

    >>> from cdlib import TemporalClustering, LifeCycle
    >>> from cdlib import algorithms
    >>> from networkx.generators.community import LFR_benchmark_graph
    >>> tc = TemporalClustering()
    >>> for t in range(0, 10):
    >>>     g = LFR_benchmark_graph(
    >>>         n=250,
    >>>         tau1=3,
    >>>         tau2=1.5,
    >>>         mu=0.1,
    >>>         average_degree=5,
    >>>         min_community=20,
    >>>         seed=10,
    >>>     )
    >>>     coms = algorithms.louvain(g)  # here any CDlib algorithm can be applied
    >>>     tc.add_clustering(coms, t)
    >>> events = LifeCycle(tc)
    >>> events.compute_events("facets")
    >>> cf = events.validate_flow("0_2", "+")
    """
    # Same guard as analyze_flow/analyze_flows: a LifeCycle built without a
    # temporal clustering has no matching to sample the null model from.
    if self.cm is None:
        raise ValueError("No temporal clustering set")
    return flow_null(self.cm, target, direction, min_branch_size, iterations)
def validate_all_flows(
    self, direction: str, min_branch_size: int = 1, iterations: int = 1000
) -> dict:
    """
    Compare all flows with null models. See validate_flow for details.

    :param direction: temporal direction, either "+" (out flow) or "-" (in flow)
    :param min_branch_size: minimum size of a branch to be considered
    :param iterations: number of random draws to be used to generate the null model
    :return: a dictionary keyed by set identifier and valued by mean, std, and p-value
    :raises ValueError: if no temporal clustering is set

    :Example:

    >>> from cdlib import TemporalClustering, LifeCycle
    >>> from cdlib import algorithms
    >>> from networkx.generators.community import LFR_benchmark_graph
    >>> tc = TemporalClustering()
    >>> for t in range(0, 10):
    >>>     g = LFR_benchmark_graph(
    >>>         n=250,
    >>>         tau1=3,
    >>>         tau2=1.5,
    >>>         mu=0.1,
    >>>         average_degree=5,
    >>>         min_community=20,
    >>>         seed=10,
    >>>     )
    >>>     coms = algorithms.louvain(g)  # here any CDlib algorithm can be applied
    >>>     tc.add_clustering(coms, t)
    >>> events = LifeCycle(tc)
    >>> events.compute_events("facets")
    >>> vf = events.validate_all_flows("+")
    """
    # Same guard as analyze_flow/analyze_flows: without a temporal clustering
    # there is no matching to validate.
    if self.cm is None:
        raise ValueError("No temporal clustering set")
    return all_flows_null(self.cm, direction, min_branch_size, iterations)
class CommunityMatching(object):
    """
    A class to represent and analyze temporally-evolving groups.

    Groups are stored by name; names follow the pattern 'tid_gid', where tid is
    the temporal id of the partition and gid the position of the group in it.
    """

    def __init__(self, dtype: type = int) -> None:
        """
        :param dtype: the type of the elements in the groups (int, float or str)
        """
        self.dtype = dtype
        self.tids = []
        # group name -> set of elements
        self.named_sets = defaultdict(set)
        # str(tid) -> names of the groups observed at that tid
        self.tid_to_named_sets = defaultdict(list)
        # attribute name -> {element id -> {tid -> value}}
        self.attributes = defaultdict(dict)

    # Convenience get methods
    def temporal_ids(self) -> list:
        """
        retrieve the temporal ids of the CommunityMatching.
        Temporal ids are integers that represent the observation time of a partition.
        """
        return self.tids

    def slice(self, start: int, end: int) -> object:
        """
        slice the CommunityMatching to keep only a given interval

        The result keeps the same container types as the original object and
        owns its own group sets and name lists, so filtering the slice does not
        alter the object it was taken from.

        :param start: the start of the interval
        :param end: the end of the interval
        :return: a new CommunityMatching object
        """
        temp = CommunityMatching(self.dtype)
        temp.tids = self.tids[start:end]
        kept = set(temp.tids)

        for name, group in self.named_sets.items():
            if int(name.split("_")[0]) in kept:
                temp.named_sets[name] = set(group)

        for tid, names in self.tid_to_named_sets.items():
            if int(tid) in kept:
                temp.tid_to_named_sets[tid] = list(names)

        # Attributes are keyed by element id and then by temporal id (see
        # set_attributes): the interval restricts the inner, temporal level.
        for attr_name, attr in self.attributes.items():
            temp.attributes[attr_name] = {
                elem: {t: v for t, v in by_tid.items() if t in kept}
                for elem, by_tid in attr.items()
            }
        return temp

    def universe_set(self) -> set:
        """
        retrieve the universe set.
        The universe set is the union of all sets in the CommunityMatching

        :return: the universe set
        """
        return set().union(*self.named_sets.values())

    def groups_ids(self) -> list:
        """
        retrieve the group ids of the CommunityMatching. Each id is of the form 'tid_gid' where tid is the temporal id
        and gid is the group id. The group id is a unique identifier of the group within the temporal id.

        :return: a list of ids of the temporal groups
        """
        return list(self.named_sets.keys())

    # Partition methods
    def __add_partition(self, partition: list) -> None:
        """
        add a partition to the CommunityMatching. A partition is a list of sets observed at a given time instant. Each
        partition will be assigned a unique id (tid) corresponding to the observation time, and each set in the
        partition will be assigned a unique name

        :param partition: a collection of sets
        :return: None
        :raises NotImplementedError: if the dtype of the CommunityMatching is not int, float or str
        """

        tid = len(self.tids)
        self.tids.append(tid)

        for i, group in enumerate(partition):
            name = str(tid) + "_" + str(i)
            self.tid_to_named_sets[str(tid)].append(name)

            if self.dtype in [int, float, str]:
                try:
                    self.named_sets[name] = set(group)
                except TypeError:  # group is not iterable (only 1 elem)
                    self.named_sets[name] = {group}
            else:
                raise NotImplementedError("dtype not supported")

    def set_temporal_clustering(self, partitions: object) -> None:
        """
        add multiple partitions to the CommunityMatching.

        :param partitions: an object exposing get_observation_ids() and get_clustering_at(t).communities
        :return: None
        """
        for t in partitions.get_observation_ids():
            self.__add_partition(partitions.get_clustering_at(t).communities)

    def get_partition_at(self, tid: int) -> list:
        """
        retrieve a partition by id

        :param tid: the id of the partition to retrieve
        :return: the names of the groups in the partition, or an empty list if the id is unknown
        """
        # .get() avoids registering an empty entry in the defaultdict.
        return self.tid_to_named_sets.get(str(tid), [])

    # Attribute methods
    def set_attributes(self, attributes: dict, attr_name: str) -> None:
        """
        set the temporal attributes of the elements in the CommunityMatching

        The temporal attributes must be provided as a dictionary keyed by the element id and valued by a dictionary
        keyed by the temporal id and valued by the attribute value.

        :param attr_name: the name of the attribute
        :param attributes: a dictionary of temporal attributes
        :return: None
        """
        self.attributes[attr_name] = attributes

    def get_attributes(self, attr_name, of=None) -> dict:
        """
        retrieve the temporal attributes of the CommunityMatching

        :param attr_name: the name of the attribute
        :param of: the element for which to retrieve the attributes. If None, all attributes are returned

        :return: a dictionary keyed by element id and valued by a dictionary keyed by temporal id and valued
        by the attribute value
        """
        if of is None:
            return self.attributes[attr_name]
        return self.attributes[attr_name][of]

    # Set methods
    def get_group(self, gid: str) -> set:
        """
        retrieve a group by id

        :param gid: the name of the group to retrieve
        :return: the group corresponding to the given name (an empty set if the name is unknown)
        """
        # Plain indexing on the defaultdict would register a phantom empty group
        # for every unknown name, which would then show up in groups_ids() and
        # in the flows; .get() returns the same empty set without that side effect.
        return self.named_sets.get(gid, set())

    def group_iterator(self, tid: int = None) -> iter:
        """
        returns an iterator over the groups of the CommunityMatching.
        if a temporal id is provided, it will iterate over the groups observed at that time instant

        :param tid: the temporal id of the groups to iterate over. Default is None
        :return: an iterator over the groups
        """
        if tid is None:
            yield from self.named_sets.values()
        else:
            for name in self.get_partition_at(tid):
                yield self.named_sets[name]

    def filter_on_group_size(self, min_size: int = 1, max_size: int = None) -> None:
        """
        remove groups that do not meet the size criteria

        :param min_size: the minimum size of the groups to keep
        :param max_size: the maximum size of the groups to keep. Defaults to the size of the universe set
        :return: None
        """

        if max_size is None:
            max_size = len(self.universe_set())

        # Iterate over a snapshot: entries are deleted while scanning.
        for name, set_ in list(self.named_sets.items()):
            if len(set_) < min_size or len(set_) > max_size:
                del self.named_sets[name]
                self.tid_to_named_sets[name.split("_")[0]].remove(name)

    # Element-centric methods
    def get_element_membership(self, element: object) -> list:
        """
        retrieve the list of sets that contain a given element

        :param element: the element for which to retrieve the memberships
        :return: a list of set names that contain the given element
        """
        return [name for name, set_ in self.named_sets.items() if element in set_]

    def get_all_element_memberships(self) -> dict:
        """
        retrieve the list of sets that contain each element in the CommunityMatching

        :return: a dictionary keyed by element and valued by a list of set names that contain the element
        """

        memberships = defaultdict(list)

        # One pass over the groups instead of one scan of all groups per element.
        for name, set_ in self.named_sets.items():
            for element in set_:
                memberships[element].append(name)

        return memberships

    # Flow methods
    def group_flow(self, target: str, direction: str, min_branch_size: int = 1) -> dict:
        """
        compute the flow of a group w.r.t. a given temporal direction. The flow of a group is the collection of groups
        that contain at least one element of the target group, Returns a dictionary keyed by group name and valued by
        the intersection of the target group and the group corresponding to the key.

        :param target: the name of the group to analyze
        :param direction: the temporal direction in which the group is to be analyzed
        :param min_branch_size: the minimum size of the intersection between the target group and the group
        :return: a dictionary keyed by group name and valued by the intersection of the target group and the group
        :raises ValueError: if direction is neither "+" nor "-"
        """
        tid = int(target.split("_")[0])
        if direction == "+":
            ref_tid = tid + 1
        elif direction == "-":
            ref_tid = tid - 1
        else:
            raise ValueError("direction must either be + or -")

        target_set = self.get_group(target)

        flow = dict()
        for name in self.get_partition_at(ref_tid):
            branch = target_set.intersection(self.get_group(name))
            if len(branch) >= min_branch_size:
                flow[name] = branch
        return flow

    def all_flows(self, direction: str, min_branch_size: int = 1) -> dict:
        """
        compute the flow of all groups w.r.t. a given temporal direction

        :param direction: the temporal direction in which the sets are to be analyzed
        :param min_branch_size: the minimum size of a branch to be considered
        :return: a dictionary keyed by group name and valued by the flow of the group
        """
        return {
            name: self.group_flow(name, direction, min_branch_size=min_branch_size)
            for name in self.named_sets
        }
def colormap() -> dict:
    """
    return a dictionary of colors for each event type.
    this is used to color the events in the visualization

    Every value is a '#RRGGBB' hex string. Each backward event type shares its
    color with the forward event type listed at the same position (e.g. "Birth"
    and "Death", "Merge" and "Split").

    :return: a dictionary keyed by event name and valued by a hex color string
    """

    return {
        # The olive entries used to carry a leading blank (" #808000"), which is
        # not a valid hex color string.
        "Birth": "#808000",
        "Accumulation": "#4CC89F",
        "Growth": "#929292",
        "Expansion": "#5C5C5C",
        "Continuation": "#CFBAE1",
        "Merge": "#E34856",
        "Offspring": "#0DAAE9",
        "Reorganization": "#FFA500",
        "Death": "#808000",
        "Dispersion": "#4CC89F",
        "Shrink": "#929292",
        "Reduction": "#5C5C5C",
        "Split": "#E34856",
        "Ancestor": "#0DAAE9",
        "Disassemble": "#FFA500",
    }
def write_lifecycle_json(lifecycle: LifeCycle, path: str, compress: bool = False):
    """
    Save lifecycle structure to JSON file.

    :param lifecycle: a LifeCycle object
    :param path: output filename
    :param compress: whether to compress the JSON (gzip), default False
    :return: None, the JSON representation of the object is written to ``path``

    :Example:

    >>> from cdlib import LifeCycle, TemporalClustering
    >>> from cdlib import algorithms
    >>> from networkx.generators.community import LFR_benchmark_graph
    >>> from cdlib.readwrite import write_lifecycle_json, read_lifecycle_json
    >>> tc = TemporalClustering()
    >>> for t in range(0, 10):
    >>>     g = LFR_benchmark_graph(
    >>>         n=250,
    >>>         tau1=3,
    >>>         tau2=1.5,
    >>>         mu=0.1,
    >>>         average_degree=5,
    >>>         min_community=20,
    >>>         seed=10,
    >>>     )
    >>>     coms = algorithms.louvain(g)  # here any CDlib algorithm can be applied
    >>>     tc.add_clustering(coms, t)
    >>>
    >>> events = LifeCycle(tc)
    >>> events.compute_events("facets")
    >>> write_lifecycle_json(events, "lifecycle.json")
    """

    js_dmp = json.dumps(lifecycle.to_json())

    op = gzip.open if compress else open

    # Explicit encoding: the output must not depend on the locale of the
    # machine that writes the file.
    with op(path, "wt", encoding="utf-8") as f:
        f.write(js_dmp)
def read_lifecycle_json(path: str, compress: bool = False) -> object:
    """
    Read lifecycle from JSON file.

    The returned object carries the algorithm name, the event types and the
    events stored in the file; no temporal clustering is attached to it.

    :param path: input filename
    :param compress: whether the file is in a compressed (gzip) format, default False
    :return: a LifeCycle object

    :Example:

    >>> from cdlib import LifeCycle, TemporalClustering
    >>> from cdlib import algorithms
    >>> from networkx.generators.community import LFR_benchmark_graph
    >>> from cdlib.readwrite import write_lifecycle_json, read_lifecycle_json
    >>> tc = TemporalClustering()
    >>> for t in range(0, 10):
    >>>     g = LFR_benchmark_graph(
    >>>         n=250,
    >>>         tau1=3,
    >>>         tau2=1.5,
    >>>         mu=0.1,
    >>>         average_degree=5,
    >>>         min_community=20,
    >>>         seed=10,
    >>>     )
    >>>     coms = algorithms.louvain(g)  # here any CDlib algorithm can be applied
    >>>     tc.add_clustering(coms, t)
    >>>
    >>> events = LifeCycle(tc)
    >>> events.compute_events("facets")
    >>> write_lifecycle_json(events, "lifecycle.json")
    >>> events = read_lifecycle_json("lifecycle.json")

    """

    op = gzip.open if compress else open

    # JSON text is UTF-8: decode it explicitly rather than with the locale default.
    with op(path, "rt", encoding="utf-8") as f:
        repr_ = json.load(f)

    lc = LifeCycle()

    lc.event_types = repr_["event_types"]
    lc.algo = repr_["algorithm"]

    for cid, payload in repr_["events"].items():
        evt = CommunityEvent(cid)
        evt.from_event = payload["from_event"]
        evt.to_event = payload["to_event"]
        evt.in_flow = payload["in_flow"]
        evt.out_flow = payload["out_flow"]
        lc.events[cid] = evt

    return lc
def test_creation(self):
    """Events can be computed, and flows analyzed, with every matching type."""
    tc = TemporalClustering()
    for snapshot in range(0, 10):
        graph = LFR_benchmark_graph(
            n=250,
            tau1=3,
            tau2=1.5,
            mu=0.1,
            average_degree=5,
            min_community=20,
            seed=10,
        )
        # here any CDlib algorithm can be applied
        tc.add_clustering(algorithms.louvain(graph), snapshot)

    # A fresh LifeCycle is built for each matching strategy.
    for matching_type in ("facets", "asur", "greene"):
        events = LifeCycle(tc)
        events.compute_events(matching_type)

        all_flows = events.analyze_flows("+")
        self.assertIsInstance(all_flows, dict)

        single_flow = events.analyze_flow("0_2", "+")
        self.assertIsInstance(single_flow, dict)
CDlib algorithm can be applied + tc.add_clustering(coms, t) + + events = LifeCycle(tc) + events.compute_events("facets") + g = events.polytree() + self.assertIsInstance(g, nx.DiGraph) + + def test_null_model(self): + tc = TemporalClustering() + for t in range(0, 10): + g = LFR_benchmark_graph( + n=250, + tau1=3, + tau2=1.5, + mu=0.1, + average_degree=5, + min_community=20, + seed=10, + ) + coms = algorithms.louvain(g) # here any CDlib algorithm can be applied + tc.add_clustering(coms, t) + + events = LifeCycle(tc) + events.compute_events("facets") + cf = events.validate_flow("0_2", "+") + self.assertIsInstance(cf, dict) + + vf = events.validate_all_flows("+") + self.assertIsInstance(vf, dict) + + def test_viz(self): + + tc = TemporalClustering() + for t in range(0, 10): + g = LFR_benchmark_graph( + n=250, + tau1=3, + tau2=1.5, + mu=0.1, + average_degree=5, + min_community=20, + seed=10, + ) + coms = algorithms.louvain(g) # here any CDlib algorithm can be applied + tc.add_clustering(coms, t) + + events = LifeCycle(tc) + events.compute_events("facets") + + fig = plot_flow(events) + self.assertIsInstance(fig, go.Figure) + + plot_event_radar(events, "0_2", direction="+") + plt.savefig("radar.pdf") + os.remove("radar.pdf") + + plot_event_radars(events, "0_2") + plt.savefig("radars.pdf") + os.remove("radars.pdf") + + typicality_distribution(events, "+") + plt.savefig("td.pdf") + os.remove("td.pdf") + + def test_explicit(self): + + dg = dn.DynGraph() + for x in range(10): + g = nx.erdos_renyi_graph(200, 0.05) + dg.add_interactions_from(list(g.edges()), t=x) + coms = algorithms.tiles(dg, 2) + + events = LifeCycle(coms) + events.compute_events_from_explicit_matching() + + c = events.analyze_flows("+") + self.assertIsInstance(c, dict) + + def test_node_attributes(self): + import random + + def random_attributes(): + attrs = {} + for i in range(250): + attrs[i] = {} + for t in range(10): + attrs[i][t] = random.choice(["A", "B", "C", "D", "E"]) + return attrs + + tc = 
TemporalClustering() + for t in range(0, 10): + g = LFR_benchmark_graph( + n=250, + tau1=3, + tau2=1.5, + mu=0.1, + average_degree=5, + min_community=20, + seed=10, + ) + coms = algorithms.louvain(g) # here any CDlib algorithm can be applied + tc.add_clustering(coms, t) + + events = LifeCycle(tc) + events.compute_events("facets") + events.set_attribute(random_attributes(), "fakeattribute") + attrs = events.get_attribute("fakeattribute") + self.assertIsInstance(attrs, dict) + + events.analyze_flow("1_1", "+", attr="fakeattribute") + self.assertIsInstance(attrs, dict) + + ev = events.get_event("1_1") + a = ev.out_flow # to get the out flow of the community 1_2 + self.assertIsInstance(a, dict) + a = ev.in_flow # to get the in flow of the community 1_2 + self.assertIsInstance(a, dict) + a = ev.from_event # to get the from events of the community 1_2 + self.assertIsInstance(a, dict) + a = ev.to_event # to get the to events of the community 1_2 + self.assertIsInstance(a, dict) + + def test_marginal(self): + tc = TemporalClustering() + for t in range(0, 10): + g = LFR_benchmark_graph( + n=250, + tau1=3, + tau2=1.5, + mu=0.1, + average_degree=5, + min_community=20, + seed=10, + ) + coms = algorithms.louvain(g) # here any CDlib algorithm can be applied + tc.add_clustering(coms, t) + + events = LifeCycle(tc) + events.compute_events("facets") + + # marginal tests (not all methods are tested since they are not of use in cdlib - + # they are invoked for completeness) + self.assertIsInstance( + events.cm.slice(0, 5), cdlib.lifecycles.classes.matching.CommunityMatching + ) + self.assertIsInstance(events.cm.universe_set(), set) + self.assertIsInstance(list(events.cm.group_iterator()), list) + self.assertIsInstance(list(events.cm.group_iterator(3)), list) + events.cm.filter_on_group_size(1, 100) + events.cm.get_element_membership(1) + events.cm.get_all_element_memberships() + events.get_events() + events.get_event_types() + ev = events.get_event("1_1") + ev.get_from_event() + 
ev.get_to_event() + facets((events.cm), "0_2", "+") + event_weights(events.cm, "0_2", "+") + evn(events.cm, "0_2", "+") + + +if __name__ == "__main__": + unittest.main() diff --git a/cdlib/test/test_io.py b/cdlib/test/test_io.py index dc5ff44f..5cbfcd9b 100644 --- a/cdlib/test/test_io.py +++ b/cdlib/test/test_io.py @@ -1,6 +1,10 @@ import unittest -from cdlib import algorithms + from cdlib import readwrite +from cdlib import LifeCycle, TemporalClustering +from cdlib import algorithms +from networkx.generators.community import LFR_benchmark_graph +from cdlib.readwrite import write_lifecycle_json, read_lifecycle_json import networkx as nx import os @@ -51,3 +55,26 @@ def test_read_write_json(self): cr = f.read() readwrite.read_community_from_json_string(cr) os.remove("coms.json") + + def test_events_read_write(self): + + tc = TemporalClustering() + for t in range(0, 10): + g = LFR_benchmark_graph( + n=250, + tau1=3, + tau2=1.5, + mu=0.1, + average_degree=5, + min_community=20, + seed=10, + ) + coms = algorithms.louvain(g) + tc.add_clustering(coms, t) + + events = LifeCycle(tc) + events.compute_events("facets") + write_lifecycle_json(events, "lifecycle.json") + e = read_lifecycle_json("lifecycle.json") + self.assertIsInstance(e, LifeCycle) + os.remove("lifecycle.json") diff --git a/cdlib/test/test_nodeclustering.py b/cdlib/test/test_nodeclustering.py index ab375a9a..f6010d67 100644 --- a/cdlib/test/test_nodeclustering.py +++ b/cdlib/test/test_nodeclustering.py @@ -77,3 +77,20 @@ def test_comparison(self): self.assertIsInstance(coms.adjusted_mutual_information(coms2).score, float) self.assertIsInstance(coms.adjusted_rand_index(coms2).score, float) self.assertIsInstance(coms.variation_of_information(coms2).score, float) + + self.assertIsInstance(coms.partition_closeness_simple(coms2).score, float) + self.assertIsInstance(coms.ecs(coms2).score, float) + self.assertIsInstance(coms.jaccard_index(coms2).score, float) + self.assertIsInstance(coms.rand_index(coms2).score, 
float) + self.assertIsInstance(coms.fowlkes_mallows_index(coms2).score, float) + self.assertIsInstance(coms.classification_error(coms2).score, float) + self.assertIsInstance(coms.czekanowski_index(coms2).score, float) + self.assertIsInstance(coms.dice_index(coms2).score, float) + self.assertIsInstance(coms.sorensen_index(coms2).score, float) + self.assertIsInstance(coms.rogers_tanimoto_index(coms2).score, float) + self.assertIsInstance(coms.southwood_index(coms2).score, float) + self.assertIsInstance(coms.mi(coms2).score, float) + self.assertIsInstance(coms.rmi(coms2).score, float) + self.assertIsInstance(coms.geometric_accuracy(coms2).score, float) + self.assertIsInstance(coms.overlap_quality(coms2).score, float) + self.assertIsInstance(coms.sample_expected_sim(coms2).score, float) diff --git a/cdlib/test/test_partitions_comparisons.py b/cdlib/test/test_partitions_comparisons.py index d2ca335a..4fff4b01 100644 --- a/cdlib/test/test_partitions_comparisons.py +++ b/cdlib/test/test_partitions_comparisons.py @@ -126,108 +126,76 @@ def test_clusim(self): louvain_communities = louvain(g) lp_communities = label_propagation(g) - score = evaluation.ecs( - louvain_communities, lp_communities - ) + score = evaluation.ecs(louvain_communities, lp_communities) self.assertLessEqual(score.score, 1) self.assertGreaterEqual(score.score, 0) - score = evaluation.jaccard_index( - louvain_communities, lp_communities - ) + score = evaluation.jaccard_index(louvain_communities, lp_communities) self.assertLessEqual(score.score, 1) self.assertGreaterEqual(score.score, 0) - score = evaluation.rand_index( - louvain_communities, lp_communities - ) + score = evaluation.rand_index(louvain_communities, lp_communities) self.assertLessEqual(score.score, 1) self.assertGreaterEqual(score.score, 0) - score = evaluation.fowlkes_mallows_index( - louvain_communities, lp_communities - ) + score = evaluation.fowlkes_mallows_index(louvain_communities, lp_communities) self.assertLessEqual(score.score, 1) 
self.assertGreaterEqual(score.score, 0) - score = evaluation.classification_error( - louvain_communities, lp_communities - ) + score = evaluation.classification_error(louvain_communities, lp_communities) self.assertLessEqual(score.score, 1) self.assertGreaterEqual(score.score, 0) - score = evaluation.czekanowski_index( - louvain_communities, lp_communities - ) + score = evaluation.czekanowski_index(louvain_communities, lp_communities) self.assertLessEqual(score.score, 1) self.assertGreaterEqual(score.score, 0) - score = evaluation.dice_index( - louvain_communities, lp_communities - ) + score = evaluation.dice_index(louvain_communities, lp_communities) self.assertLessEqual(score.score, 1) self.assertGreaterEqual(score.score, 0) - score = evaluation.sorensen_index( - louvain_communities, lp_communities - ) + score = evaluation.sorensen_index(louvain_communities, lp_communities) self.assertLessEqual(score.score, 1) self.assertGreaterEqual(score.score, 0) - score = evaluation.rogers_tanimoto_index( - louvain_communities, lp_communities - ) + score = evaluation.rogers_tanimoto_index(louvain_communities, lp_communities) self.assertLessEqual(score.score, 1) self.assertGreaterEqual(score.score, 0) - score = evaluation.southwood_index( - louvain_communities, lp_communities - ) + score = evaluation.southwood_index(louvain_communities, lp_communities) self.assertGreaterEqual(score.score, 0) - score = evaluation.mi( - louvain_communities, lp_communities - ) + score = evaluation.mi(louvain_communities, lp_communities) self.assertLessEqual(score.score, 1) self.assertGreaterEqual(score.score, 0) - score = evaluation.rmi( - louvain_communities, lp_communities - ) + score = evaluation.rmi(louvain_communities, lp_communities) self.assertLessEqual(score.score, 1) self.assertGreaterEqual(score.score, 0) - score = evaluation.geometric_accuracy( - louvain_communities, lp_communities - ) + score = evaluation.geometric_accuracy(louvain_communities, lp_communities) 
self.assertLessEqual(score.score, 1) self.assertGreaterEqual(score.score, 0) - score = evaluation.overlap_quality( - louvain_communities, lp_communities - ) + score = evaluation.overlap_quality(louvain_communities, lp_communities) self.assertLessEqual(score.score, 1) self.assertGreaterEqual(score.score, 0) - score = evaluation.sample_expected_sim( - louvain_communities, lp_communities - ) + score = evaluation.sample_expected_sim(louvain_communities, lp_communities) self.assertLessEqual(score.score, 1) self.assertGreaterEqual(score.score, 0) - - diff --git a/cdlib/test/test_temporal_clustering.py b/cdlib/test/test_temporal_clustering.py index ae7f7f10..7777751b 100644 --- a/cdlib/test/test_temporal_clustering.py +++ b/cdlib/test/test_temporal_clustering.py @@ -40,37 +40,11 @@ def test_stability(self): trend = tc.clustering_stability_trend(evaluation.normalized_mutual_information) self.assertEqual(len(trend), len(tc.get_observation_ids()) - 1) - def test_matching(self): - tc = get_temporal_network_clustering() - matches = tc.community_matching( - lambda x, y: len(set(x) & set(y)) / len(set(x) | set(y)), False - ) - self.assertIsInstance(matches, list) - self.assertIsInstance(matches[0], tuple) - self.assertEqual(len(matches[0]), 3) - - matches = tc.community_matching( - lambda x, y: len(set(x) & set(y)) / len(set(x) | set(y)), True - ) - self.assertIsInstance(matches, list) - self.assertIsInstance(matches[0], tuple) - self.assertEqual(len(matches[0]), 3) - - def test_lifecycle(self): - tc = get_temporal_network_clustering() - pt = tc.lifecycle_polytree( - lambda x, y: len(set(x) & set(y)) / len(set(x) | set(y)), True - ) - self.assertIsInstance(pt, nx.DiGraph) - def test_community_access(self): tc = get_temporal_network_clustering() - pt = tc.lifecycle_polytree( - lambda x, y: len(set(x) & set(y)) / len(set(x) | set(y)), True - ) - for cid in pt.nodes(): - com = tc.get_community(cid) - self.assertIsInstance(com, list) + + com = tc.get_community("0_0") + 
self.assertIsInstance(com, list) def test_to_json(self): tc = get_temporal_network_clustering() @@ -78,12 +52,3 @@ def test_to_json(self): self.assertIsInstance(js, str) res = json.loads(js) self.assertIsNone(res["matchings"]) - - tc = get_temporal_network_clustering() - tc.lifecycle_polytree( - lambda x, y: len(set(x) & set(y)) / len(set(x) | set(y)), True - ) - js = tc.to_json() - self.assertIsInstance(js, str) - res = json.loads(js) - self.assertIsNotNone(res["matchings"]) diff --git a/cdlib/test/test_viz_network.py b/cdlib/test/test_viz_network.py index f2d05375..708ffe27 100644 --- a/cdlib/test/test_viz_network.py +++ b/cdlib/test/test_viz_network.py @@ -56,16 +56,16 @@ def test_community_graph(self): os.remove("cg.pdf") def test_highlighted_clusters(self): - + g = nx.karate_club_graph() coms = algorithms.louvain(g) viz.plot_network_highlighted_clusters(g, coms) - + plt.savefig("highlighted_clusters.pdf") os.remove("highlighted_clusters.pdf") - + coms = algorithms.demon(g, 0.25) viz.plot_network_highlighted_clusters(g, coms) - + plt.savefig("highlighted_clusters.pdf") - os.remove("highlighted_clusters.pdf") \ No newline at end of file + os.remove("highlighted_clusters.pdf") diff --git a/cdlib/utils.py b/cdlib/utils.py index 9ca5a3a8..ae9b74a9 100644 --- a/cdlib/utils.py +++ b/cdlib/utils.py @@ -225,7 +225,7 @@ def nx_node_integer_mapping(graph: object) -> tuple: def remap_node_communities(communities: object, node_map: dict) -> list: - """Apply a map to the obtained communities to retreive the original node labels + """Apply a map to the obtained communities to retrieve the original node labels :param communities: NodeClustering object :param node_map: dictionary diff --git a/cdlib/viz/__init__.py b/cdlib/viz/__init__.py index b25e7426..a781342c 100644 --- a/cdlib/viz/__init__.py +++ b/cdlib/viz/__init__.py @@ -1,2 +1,3 @@ from .networks import * from .plots import * +from .community_events import * diff --git a/cdlib/viz/community_events.py
def _values_to_idx(links):
    """Attach integer node ids to a table of flow links.

    Labels are collected from both the ``source`` and ``target`` columns,
    sorted, and numbered by their position in that sorted order.

    :param links: DataFrame with ``source``, ``target`` and ``value`` columns
    :return: a new DataFrame with columns ``source``, ``target``,
        ``source_ID``, ``target_ID`` and ``value``; ``links`` is not modified
    """
    df = links[["source", "target"]].copy()
    all_labels = sorted(set(links["source"].tolist() + links["target"].tolist()))
    # build the label -> position map once instead of calling list.index per row
    label_to_idx = {label: idx for idx, label in enumerate(all_labels)}

    df["source_ID"] = [label_to_idx[label] for label in df["source"]]
    df["target_ID"] = [label_to_idx[label] for label in df["target"]]
    df["value"] = links["value"]
    return df
def _make_radar(values, categories, rescale, title="", color="green", ax=None):
    """Draw a radar (spider) chart of ``values`` over ``categories``.

    :param values: list of numbers, one per category; it is copied, not mutated
    :param categories: list of axis labels, in the same order as ``values``
    :param rescale: if True the radial limit is ``max(values) + 0.1``, otherwise 1
    :param title: title of the plot; no title is set when empty
    :param color: color of the radar outline
    :param ax: polar matplotlib axis to draw on; when None a new
        ``plt.subplot(111, polar=True)`` is created
    :return: the matplotlib axis
    """
    # number of variables
    n_axes = len(categories)

    # angle of each axis in the plot (full circle divided by number of variables)
    angles = [n / float(n_axes) * 2 * np.pi for n in range(n_axes)]
    angles.append(angles[0])  # to close the line
    values = values.copy()
    values.append(values[0])  # to close the line

    # Initialise the spider plot
    if ax is None:
        ax = plt.subplot(
            111,
            polar=True,
        )

    # Put the first axis on top and go clockwise
    ax.set_theta_offset(np.pi / 2)
    ax.set_theta_direction(-1)

    # Draw one axis per variable; the tick labels set here are replaced below
    # by manually placed text objects
    ax.set_xticks(angles[:-1], categories, color="blue", size=10)
    # Draw ylabels
    ax.set_rlabel_position(10)
    ticks = list(np.linspace(0, 1, 5))

    ax.set_rticks(ticks, [str(v) for v in ticks], color="grey", size=9)
    ax.grid(True)

    # presumably forces a render so tick-label positions are resolved before
    # they are read below -- TODO confirm
    plt.gcf().canvas.draw()

    angles_labels = np.rad2deg(angles)
    angles_labels = [360 - a for a in angles_labels]
    angles_labels = [180 + a if 90 < a < 270 else a for a in angles_labels]
    labels = []
    for label, angle in zip(ax.get_xticklabels(), angles_labels):
        x, y = label.get_position()
        lab = ax.text(
            x,
            y + 0.05,
            label.get_text(),
            transform=label.get_transform(),
            ha=label.get_ha(),
            va=label.get_va(),
            color="grey",
            size=11,
            fontdict={"variant": "small-caps"},
        )
        lab.set_rotation(angle)
        labels.append(lab)
    ax.set_xticklabels([])

    ax.plot(angles, values, color=color, linewidth=1.5, linestyle="solid")

    # alpha=0.0 makes this fill fully transparent
    ax.fill(angles, values, color="red", alpha=0.0)
    if rescale:
        ax.set_rmax(max(values) + 0.1)
    else:
        ax.set_rmax(1)
    ax.set_rmin(0)
    if title != "":
        ax.set_title(title + "\n\n")
    return ax
def plot_event_radar(
    lc: LifeCycle,
    set_name: str,
    direction: str,
    min_branch_size: int = 1,
    rescale: bool = True,
    color: str = "green",
    ax: object = None,
):
    """
    Plot the radar of event weights for a given event set.

    :param lc: the lifecycle object
    :param set_name: the event set name, e.g. "0_2"
    :param direction: the direction of the event set, either "+" or "-"
    :param min_branch_size: the minimum size of a branch to be considered, defaults to 1
    :param rescale: rescale the radar to the maximum value, defaults to True
    :param color: the color of the radar, defaults to "green"
    :param ax: the matplotlib axis, defaults to None
    :return: the matplotlib axis
    :raises ValueError: if the lifecycle has no community matching set (``lc.cm`` is None)

    :Example:

    >>> from cdlib import TemporalClustering, LifeCycle
    >>> from cdlib import algorithms
    >>> from cdlib.viz import plot_event_radar
    >>> import matplotlib.pyplot as plt
    >>> from networkx.generators.community import LFR_benchmark_graph
    >>> tc = TemporalClustering()
    >>> for t in range(0, 10):
    >>>     g = LFR_benchmark_graph(
    >>>         n=250,
    >>>         tau1=3,
    >>>         tau2=1.5,
    >>>         mu=0.1,
    >>>         average_degree=5,
    >>>         min_community=20,
    >>>         seed=10,
    >>>     )
    >>>     coms = algorithms.louvain(g)  # here any CDlib algorithm can be applied
    >>>     tc.add_clustering(coms, t)
    >>> events = LifeCycle(tc)
    >>> events.compute_events("facets")
    >>> ax = plot_event_radar(events, "0_2", "+")
    >>> plt.show()

    """
    if lc.cm is None:
        raise ValueError("No temporal clustering set")
    matching = lc.cm

    flow = analyze_flow(
        matching, set_name, direction=direction, min_branch_size=min_branch_size
    )
    # event_weights_from_flow is given a {set name: flow analysis} mapping
    weights = event_weights_from_flow({set_name: flow}, direction=direction)
    set_weights = weights[set_name]
    return _make_radar(
        list(set_weights.values()),
        list(set_weights.keys()),
        rescale=rescale,
        color=color,
        ax=ax,
    )
def typicality_distribution(
    lc: LifeCycle,
    direction: str,
    width: int = 800,
    height: int = 500,
    showlegend: bool = True,
) -> go.Figure:
    """
    Plot the distribution of typicality of events in a given direction.

    :param lc: the lifecycle object
    :param direction: the direction of the events, either "+" or "-"
    :param width: the width of the figure, defaults to 800
    :param height: the height of the figure, defaults to 500
    :param showlegend: show the legend, defaults to True
    :return: a plotly figure
    :raises ValueError: if the lifecycle has no community matching set (``lc.cm`` is None)

    :Example:

    >>> from cdlib import TemporalClustering, LifeCycle
    >>> from cdlib import algorithms
    >>> from cdlib.viz import typicality_distribution
    >>> from networkx.generators.community import LFR_benchmark_graph
    >>> tc = TemporalClustering()
    >>> for t in range(0, 10):
    >>>     g = LFR_benchmark_graph(
    >>>         n=250,
    >>>         tau1=3,
    >>>         tau2=1.5,
    >>>         mu=0.1,
    >>>         average_degree=5,
    >>>         min_community=20,
    >>>         seed=10,
    >>>     )
    >>>     coms = algorithms.louvain(g)  # here any CDlib algorithm can be applied
    >>>     tc.add_clustering(coms, t)
    >>> events = LifeCycle(tc)
    >>> events.compute_events("facets")
    >>> fig = typicality_distribution(events, "+")
    >>> fig.show()

    """
    if lc.cm is None:
        raise ValueError("No temporal clustering set")
    matching = lc.cm

    events = events_all(matching)
    # each entry is unpacked into the ("event", "event_typicality") columns below
    typicalities = [event_typicality(event) for event in events[direction].values()]
    df = pd.DataFrame(typicalities, columns=["event", "event_typicality"])
    # round to 1 decimal so that it works for the histogram
    df["event_typicality"] = df["event_typicality"].apply(lambda x: round(x, 1))
    # replace 1 by 0.99 so that it is included in the last bin
    df["event_typicality"] = df["event_typicality"].apply(
        lambda x: 0.99 if x == 1 else x
    )

    bins = {"start": 0, "end": 1.1, "size": 0.1}
    categories_present = df["event"].unique()

    fig = go.Figure()
    for event in categories_present:
        fig.add_trace(
            go.Histogram(
                x=df[df["event"] == event]["event_typicality"],
                name=event,
                opacity=0.75,
                xbins=dict(bins),
            )
        )

    possible_values = (
        utils.forward_event_names()
        if direction == "+"
        else utils.backward_event_names()
    )

    # event types that never occur still get an (empty) trace so that they
    # appear in the legend
    for category in possible_values:
        if category not in categories_present:
            fig.add_trace(
                go.Histogram(
                    x=[None],
                    name=category,
                    opacity=0.75,
                    xbins=dict(bins),
                    showlegend=True,
                )
            )

    colors = utils.colormap()
    for trace in fig.data:
        trace.marker.color = colors[trace.name]

    fig.update_layout(showlegend=showlegend)
    fig.update_layout(barmode="stack")

    fig.update_xaxes(range=[0, 1.01], tickvals=np.arange(0, 1.01, 0.1))
    # set figure size
    fig.update_layout(width=width, height=height, template="simple_white")

    return fig
graph[edge[0]][edge[1]]['weight'] = 1 - + graph[edge[0]][edge[1]]["weight"] = 1 + # Update node positions based on edge weights - position = nx.spring_layout(graph, weight='weight', pos=position) - + position = nx.spring_layout(graph, weight="weight", pos=position) + n_communities = len(partition) if n_communities == 0: warnings.warn("There are no communities that match the filter criteria.") @@ -316,13 +320,23 @@ def plot_network_highlighted_clusters( ) if isinstance(node_size, int): fig = nx.draw_networkx_nodes( - graph, position, node_size=node_size, node_color="w", nodelist=filtered_nodelist + graph, + position, + node_size=node_size, + node_color="w", + nodelist=filtered_nodelist, ) fig.set_edgecolor("k") - - filtered_edge_widths = [1] * len(filtered_edgelist) - - nx.draw_networkx_edges(graph, position, alpha=0.25, edgelist=filtered_edgelist, width=filtered_edge_widths) + + filtered_edge_widths = [1] * len(filtered_edgelist) + + nx.draw_networkx_edges( + graph, + position, + alpha=0.25, + edgelist=filtered_edgelist, + width=filtered_edge_widths, + ) if plot_labels: nx.draw_networkx_labels( @@ -331,13 +345,13 @@ def plot_network_highlighted_clusters( font_color=".8", labels={node: str(node) for node in filtered_nodelist}, ) - + for i in range(n_communities): if len(partition[i]) > 0: if plot_overlaps: - size = (n_communities - i) * node_size + size = (n_communities - i) * node_size else: - size = node_size + size = node_size fig = nx.draw_networkx_nodes( graph, position, @@ -346,24 +360,30 @@ def plot_network_highlighted_clusters( node_color=[cmap(_norm(i))], ) fig.set_edgecolor("k") - + # Plotting highlighted clusters for i, community in enumerate(partition): if len(community) > 0: # Extracting coordinates of community nodes x_values = [position[node][0] for node in community] y_values = [position[node][1] for node in community] - - min_x, max_x = min(x_values) , max(x_values) - min_y, max_y = min(y_values) , max(y_values) + + min_x, max_x = min(x_values), 
max(x_values) + min_y, max_y = min(y_values), max(y_values) # Create a polygon using the min and max coordinates - polygon = Polygon([(min_x, min_y), (max_x, min_y), (max_x, max_y), (min_x, max_y)], - edgecolor=cmap(_norm(i)), facecolor=cmap(_norm(i)), alpha=0.3) + polygon = Polygon( + [(min_x, min_y), (max_x, min_y), (max_x, max_y), (min_x, max_y)], + edgecolor=cmap(_norm(i)), + facecolor=cmap(_norm(i)), + alpha=0.3, + ) plt.gca().add_patch(polygon) - + # Extracting edges intra-community - intra_community_edges = [(u, v) for u, v in graph.edges() if u in community and v in community] + intra_community_edges = [ + (u, v) for u, v in graph.edges() if u in community and v in community + ] # Plot edges intra-community with the color of the community and increased width nx.draw_networkx_edges( @@ -468,6 +488,7 @@ def calculate_cluster_sizes(partition: NodeClustering) -> Union[int, dict]: else: return cluster_sizes # Elements have different values, return the dictionary + def plot_community_graph( graph: object, partition: NodeClustering, diff --git a/conda/meta.yaml b/conda/meta.yaml index 8cbe7910..6bad2910 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,5 +1,5 @@ {% set name = "cdlib" %} -{% set version = "0.3.1" %} +{% set version = "0.4.0" %} package: name: "{{ name|lower }}" @@ -32,6 +32,7 @@ requirements: - thresholdclustering - python-Levenshtein - setuptools + - plotly build: - python - setuptools @@ -57,6 +58,7 @@ requirements: - dynetx - thresholdclustering - python-Levenshtein + - plotly about: home: "https://github.com/GiulioRossetti/cdlib" diff --git a/docs/conf.py b/docs/conf.py index 6bc1f766..13919412 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -16,10 +16,13 @@ import sys, os import sphinx_rtd_theme -sys.path.append(os.path.join(os.path.dirname(__file__), '..')) -from cdlib import __version__ +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) +try: + from cdlib import __version__ +except ImportError: + __version__ = "0.4.0" 
-html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] version = __version__ @@ -27,19 +30,16 @@ release = version html_theme_options = { - 'collapse_navigation': False, - 'display_version': False, + "collapse_navigation": False, + "display_version": False, "navigation_depth": 3, } # -- Project information ----------------------------------------------------- -project = 'CDlib' -copyright = '2024, Giulio Rossetti' -author = 'Giulio Rossetti' - -# The full version, including alpha/beta/rc tags -release = '0.3.1' +project = "CDlib" +copyright = "2024, Giulio Rossetti" +author = "Giulio Rossetti" autodoc_mock_imports = [ "graph_tool.all", @@ -99,6 +99,8 @@ "scipy.stats", "clusim.sim", "clusim.clustering", + "plotly", + "plotly.graph_objects", ] # -- General configuration --------------------------------------------------- @@ -114,24 +116,23 @@ ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] - +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -#html_theme = 'alabaster' +# html_theme = 'alabaster' html_logo = "cdlib_new.png" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". 
-html_static_path = ['_static'] \ No newline at end of file +html_static_path = ["_static"] diff --git a/docs/installing.rst b/docs/installing.rst index 830497a3..facf516b 100644 --- a/docs/installing.rst +++ b/docs/installing.rst @@ -77,7 +77,7 @@ graph-tool ---------- ``CDlib`` integrates the support for SBM models offered by ``graph-tool``. -To install it, refer to the official `documentation `_ and install the conda-forge version of the package (or the deb version if in a *nix system). +To install it, refer to the official `documentation `_ and install the conda-forge version of the package (or the deb version if in a Unix system). ------ ASLPAw diff --git a/docs/reference/algorithms.rst b/docs/reference/algorithms.rst index 3d5a6d6d..4b6c21df 100644 --- a/docs/reference/algorithms.rst +++ b/docs/reference/algorithms.rst @@ -1,40 +1,27 @@ -============================== -Community Discovery algorithms -============================== +========================== +Static Community Discovery +========================== ``CDlib`` collects implementations of several Community Discovery algorithms. -To maintain the library organization as clean and resilient to changes as possible, the exposed algorithms are grouped following a simple rationale: - -1. Algorithms designed for static networks, and -2. Algorithms designed for dynamic networks. - -Moreover, within each category, ``CDlib`` groups together approaches sharing the same high-level characteristics. - -In particular, static algorithms are organized into: - -- Those searching for a *crisp* partition of the node-set; -- Those searching for an *overlapping* clustering of the node-set; -- Those that search for a *fuzzy* partition of the node-set; -- Those that cluster *edges*; -- Those that are designed to partition *bipartite* networks; -- Those that are designed to cluster *feature-rich* (node attributed) networks; -- Those that search for *antichains* in DAG (directed acyclic graphs). 
- -Dynamic algorithms, conversely, are organized to resemble the taxonomy proposed in [Rossetti18]_ - -- Instant Optimal, -- Temporal Trade-off - -This documentation follows the same rationale. +To maintain the library organization as clean and resilient to changes as possible, the exposed algorithms are grouped as: .. toctree:: :maxdepth: 1 cd_algorithms/node_clustering.rst cd_algorithms/edge_clustering.rst - cd_algorithms/temporal_clustering.rst +Moreover, node clustering algorithms are further divided to take into account the type of partition they search for: + +- *Crisp* partition (i.e., hard clustering); +- *Overlapping* clustering (i.e., a node can belong to multiple communities); +- *Fuzzy* partition (i.e., soft clustering); +- *Bipartite* clustering (i.e., clustering of bipartite networks); +- *Feature-rich* (node-attributed) clustering (i.e., clustering of attributed networks leveraging both topology and node features); +- *Antichains* clustering in DAG (directed acyclic graphs). + +For each algorithm, the documentation provides a brief description, the list of parameters, and the reference to the original paper. ---------------- Ensemble Methods @@ -49,17 +36,3 @@ Learn how to (i) pool multiple algorithms on the same network, (ii) perform fitn :maxdepth: 1 ensemble.rst - -------- -Summary -------- - -If you need a summary of the available algorithms and their properties (accepted graph types, community characteristics, computational complexity), refer to: - -.. toctree:: - :maxdepth: 1 - - cd_algorithms/algorithms.rst - - -.. [Rossetti18] Rossetti, Giulio, and Rémy Cazabet. "Community discovery in dynamic networks: a survey." ACM Computing Surveys (CSUR) 51.2 (2018): 1-37. 
\ No newline at end of file diff --git a/docs/reference/benchmark.rst b/docs/reference/benchmark.rst index 7739e107..eb2bc309 100644 --- a/docs/reference/benchmark.rst +++ b/docs/reference/benchmark.rst @@ -22,7 +22,7 @@ All generators return a tuple: (``networkx.Graph``, ``cdlib.NodeClustering``) .. autosummary:: - :toctree: bench/ + :toctree: generated/ GRP LFR @@ -33,7 +33,7 @@ All generators return a tuple: (``networkx.Graph``, ``cdlib.NodeClustering``) Benchmarks for node-attributed static networks. .. autosummary:: - :toctree: bench/ + :toctree: generated/ XMark @@ -46,7 +46,7 @@ Time-evolving network topologies with planted community life cycles. All generators return a tuple: (``dynetx.DynGraph``, ``cdlib.TemporalClustering``) .. autosummary:: - :toctree: bench/ + :toctree: generated/ RDyn diff --git a/docs/reference/cd_algorithms/algorithms.rst b/docs/reference/cd_algorithms/algorithms.rst deleted file mode 100644 index 720202cb..00000000 --- a/docs/reference/cd_algorithms/algorithms.rst +++ /dev/null @@ -1,201 +0,0 @@ -================= -Algorithms' Table -================= - -The following table shows an up-to-date list of the Community Detection algorithms made available within ``cdlib``. - -Algorithms are listed in alphabetical order along with: - -- a few additional information on the graph typologies they handle, and -- the main expected characteristics of the clustering they produce, -- (when available) the theoretical computational complexity estimated by their authors. - -Apart from a few reported exceptions, all algorithms are assumed to work on undirected and unweighted graphs. 
- -**Complexity notation.** When discussing the time complexity, the following notation is assumed: - -- *n*: number of nodes -- *m*: number of edges -- *k*: number of iterations -- *c*: number of communities -- *d*: average node degree - -+--------------------------------+-------------------------------------------------------------+--------------------------------------------------+-----------------+ -| | Network | Communities | Complexity | -| Algorithm +-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| | Directed | Weighted | Bipartite | Feature-Rich | Temporal | Crisp | Overlaps | Nested | Fuzzy | Hierarchical | Time | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| agdl | x | x | | | | x | | | | | O(n^2) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| angel | | | | | | | x | | | | O(n) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| aslpaw | | | | | | | x | | | | O(kn) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| async_fluid | | | | | | x | | | | | O(m) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| belief | | | | | | x | | | | | O(kn) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| big_clam | | | | | | x | x | x | | | O(n) | 
-+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| bimlpa | | | x | | | x | | | | | O(m) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| chinesewhispers | | x | | | | x | | | | | O(km) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| condor | | | x | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| conga | | | | | | | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| congo | | | | | | | x | | | | O(nm^2) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| core_expansion | | | | | | | x | | | | O(nlogn) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| cpm | | x | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| CPM_bipartite | | | x | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| coach | | | | | | | x | | | | | 
-+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| danmf | | x | | | | | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| dcs | | x | | | | | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| demon | | | | | | | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| der | | x | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| dpclus | | x | | | | | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| edmot | | x | x | | | x | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| ebgc | | | | | | | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| ego_networks | | | | | | | x | | | | O(m) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| egonet_splitter | | | | | | | x | | | | O(m^3/2 ) | 
-+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| eigenvector | | | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| em | x | | | | | x | x | | x | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| endntm | | | | | | | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| eva | | | | x | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| frc_fgsn | | | x | | | | x | | x | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| ga | | | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| gdmp2 | x | | x | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| gemsec | x | | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| girvan_newman | | | | | | x | | | | x | | 
-+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| graph_entropy | | x | | | | | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| greedy_modularity | | | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| head_tail | | | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| hierarchical_link_communities | | | | | | | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| ilouvain | | | | x | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| infomap | x | x | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| infomap_bipartite | x | x | x | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| ipca | | x | | | | x | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| kclique | | | | | | | x | | | | | 
-+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| kcut | | | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| label_propagation | | | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| lais2 | | | | | | | x | | | | O(cm + n) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| leiden | | | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| lemon | | | | | | | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| lfm | | | | | | | x | | | x | O(n^2 logn) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| louvain | | | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| lpam | | | | | | | x | | | | O(2^m) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| lpanni | | | | | | | x | | | | O(n) | 
-+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| lswl | | x | | | | x | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| lswl_plus | | x | | | | x | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| markov_clustering | | | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| mcode | | x | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| mnmf | | | | | | | x | | | | O(n^2*m+n^2*k) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| mod_m | | | | | | x | | | | | O(nd) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| mod_r | | | | | | x | | | | | O(nd) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| node_perception | | | | | | | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| multicom | | | | | | | x | | | | | 
-+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| nnsed | | | | | | | x | | | | O(kn^2) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| overlapping_seed_set_expansion | | | | | | | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| paris | | x | | | | x | | | | x | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| percomvc | | | | | | | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| principled_clustering | | | | | | | x | | x | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| pycombo | | x | | | | x | | | | | O(n^2 logc) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| rb_pots | x | x | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| rber_pots | | x | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| ricci_community | | | | | | x | | | | | | 
-+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| r_spectral_clustering | | | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| sbm_dl | | | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| sbm_dl_nested | | | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| scan | | | | | | x | | | | | O(m) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| scd | | | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| spectral | | | x | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| significance_communities | | | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| sibilarity_antichain | x (DAG) | | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| slpa | | | | | | | x | | | | O(kn) | 
-+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| spinglass | | | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| surprise_communities | x | x | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| symmnmf | | | | | | | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| threshold_clustering | x | x | | | | x | | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| tiles | | | | | x | | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| umstmo | | | | | | | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| walkscan | | | | | | | x | | | | | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| walktrap | | | | | | x | | | | | O(n^2 logn) | -+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ -| wCommunity | | x | | | | | x | | | | | 
-+--------------------------------+-----------+----------+-----------+--------------+-----------+-------+----------+--------+-------+--------------+-----------------+ \ No newline at end of file diff --git a/docs/reference/cd_algorithms/edge_clustering.rst b/docs/reference/cd_algorithms/edge_clustering.rst index e2833a74..3b0765e1 100644 --- a/docs/reference/cd_algorithms/edge_clustering.rst +++ b/docs/reference/cd_algorithms/edge_clustering.rst @@ -13,7 +13,7 @@ They return as result a ``EdgeClustering`` object instance. .. autosummary:: - :toctree: algs/ + :toctree: ../generated/ hierarchical_link_community diff --git a/docs/reference/cd_algorithms/node_clustering.rst b/docs/reference/cd_algorithms/node_clustering.rst index 306260d0..8b72d916 100644 --- a/docs/reference/cd_algorithms/node_clustering.rst +++ b/docs/reference/cd_algorithms/node_clustering.rst @@ -1,17 +1,12 @@ -========================== -Static Community Discovery -========================== - ---------------- +=============== Node Clustering ---------------- +=============== Algorithms falling in this category generate communities composed of nodes. The communities can represent neat, *crisp*, partitions and *overlapping* or even *fuzzy* ones. .. note:: The following lists are aligned to CD methods available in the *GitHub main branch* of `CDlib`_. - In particular, the following algorithms are not yet released in the packaged version of the library: coach, mcode, ipca, dpclus, graph_entropy, ebgc, r_spectral_clustering. .. automodule:: cdlib.algorithms @@ -26,7 +21,7 @@ As a result, methods in this subclass return a ``NodeClustering`` object instanc .. autosummary:: - :toctree: algs/ + :toctree: ../generated/ agdl async_fluid @@ -77,7 +72,7 @@ A clustering is said to be *overlapping* if any generic node can be assigned to As a result, methods in this subclass return a ``NodeClustering`` object instance. .. 
autosummary:: - :toctree: algs/ + :toctree: ../generated/ aslpaw angel @@ -117,7 +112,7 @@ A clustering is *fuzzy* if each node can belong (with a different degree of like As a result, methods in this subclass return a ``FuzzyNodeClustering`` object instance. .. autosummary:: - :toctree: algs/ + :toctree: ../generated/ frc_fgsn principled_clustering @@ -130,7 +125,7 @@ Node Attribute As a result, methods in this subclass return a ``AttrNodeClustering`` object instance. .. autosummary:: - :toctree: algs/ + :toctree: ../generated/ eva ilouvain @@ -143,7 +138,7 @@ Bipartite Graph Communities As a result, methods in this subclass return a ``BiNodeClustering`` object instance. .. autosummary:: - :toctree: algs/ + :toctree: ../generated/ bimlpa condor @@ -159,7 +154,7 @@ Antichain Communities Methods in this subclass are designed to extract communities from Directed Acyclic Graphs (DAG) and return. As a result, a ``NodeClustering`` object instance. .. autosummary:: - :toctree: algs/ + :toctree: ../generated/ siblinarity_antichain @@ -172,7 +167,7 @@ Algorithms falling in this category generate communities composed of edges. They return, as a result, a ``EdgeClustering`` object instance. .. autosummary:: - :toctree: algs/ + :toctree: ../generated/ hierarchical_link_community diff --git a/docs/reference/classes.rst b/docs/reference/classes.rst index 52203169..49b2afdc 100644 --- a/docs/reference/classes.rst +++ b/docs/reference/classes.rst @@ -27,10 +27,7 @@ Refer to the following documentation for a complete overview of the methods expo classes/edge_clustering.rst classes/temporal_clustering.rst - ------------------------------------------------- -Using Clustering objects with your algorithm ------------------------------------------------- +.. note:: I have a clustering obtained by an algorithm not included in ``CDlib``. Can I load it in a Clustering object to leverage your library's evaluation and visualization facilities? 
diff --git a/docs/reference/classes/lifecycle.rst b/docs/reference/classes/lifecycle.rst new file mode 100644 index 00000000..6e399a5b --- /dev/null +++ b/docs/reference/classes/lifecycle.rst @@ -0,0 +1,12 @@ +================ +LifeCycle Object +================ + +The LifeCycle object is a class that represents the life cycle of temporal communities extracted from a dynamic network. +It is used to store the information about the in/out flows of nodes between communities and the from/to events they generate. + +.. currentmodule:: cdlib +.. autoclass:: LifeCycle + :members: + :inherited-members: + diff --git a/docs/reference/classes/node_clustering.rst b/docs/reference/classes/node_clustering.rst index bda6f8e8..855a8f6c 100644 --- a/docs/reference/classes/node_clustering.rst +++ b/docs/reference/classes/node_clustering.rst @@ -70,4 +70,21 @@ Comparing Node Clusterings NodeClustering.adjusted_rand_index NodeClustering.adjusted_mutual_information NodeClustering.variation_of_information + NodeClustering.partition_closeness_simple + NodeClustering.ecs + NodeClustering.jaccard_index + NodeClustering.rand_index + NodeClustering.fowlkes_mallows_index + NodeClustering.classification_error + NodeClustering.czekanowski_index + NodeClustering.dice_index + NodeClustering.sorensen_index + NodeClustering.rogers_tanimoto_index + NodeClustering.southwood_index + NodeClustering.mi + NodeClustering.rmi + NodeClustering.geometric_accuracy + NodeClustering.overlap_quality + NodeClustering.sample_expected_sim + diff --git a/docs/reference/classes/temporal_clustering.rst b/docs/reference/classes/temporal_clustering.rst index 537403db..d9bc1a6b 100644 --- a/docs/reference/classes/temporal_clustering.rst +++ b/docs/reference/classes/temporal_clustering.rst @@ -38,11 +38,4 @@ Evaluating Node Clustering TemporalClustering.clustering_stability_trend -Matching temporal clustering ----------------------------- - -.. 
autosummary:: - - TemporalClustering.community_matching - TemporalClustering.lifecycle_polytree diff --git a/docs/reference/evaluation.rst b/docs/reference/evaluation.rst index 3fe7f235..c8e60d5c 100644 --- a/docs/reference/evaluation.rst +++ b/docs/reference/evaluation.rst @@ -25,7 +25,7 @@ Fitness functions allow to summarize the characteristics of a computed set of co .. automodule:: cdlib.evaluation .. autosummary:: - :toctree: eval/ + :toctree: generated/ avg_distance avg_embeddedness @@ -53,7 +53,7 @@ Fitness functions allow to summarize the characteristics of a computed set of co Among the fitness function, a well-defined family of measures is the Modularity-based one: .. autosummary:: - :toctree: eval/ + :toctree: generated/ erdos_renyi_modularity link_modularity @@ -66,7 +66,7 @@ Among the fitness function, a well-defined family of measures is the Modularity- Some measures will return an instance of ``FitnessResult`` that takes together min/max/mean/std values of the computed index. .. autosummary:: - :toctree: eval/ + :toctree: generated/ FitnessResult @@ -78,7 +78,7 @@ It is often useful to compare different graph partitions to assess their resembl ``cdlib`` implements the following partition comparisons scores: .. autosummary:: - :toctree: eval/ + :toctree: generated/ adjusted_mutual_information mi @@ -110,7 +110,7 @@ It is often useful to compare different graph partitions to assess their resembl Some measures will return an instance of ``MatchingResult`` that takes together the computed index's mean and standard deviation values. .. autosummary:: - :toctree: eval/ + :toctree: generated/ MatchingResult @@ -153,19 +153,6 @@ All details on remote datasets can be found on the dedicated page. datasets.rst -^^^^^^^^^^^^^^^^^^ -Ranking Algorithms -^^^^^^^^^^^^^^^^^^ - -Once a set of alternative clusterings have been extracted from a given network, is there a way to select the *best* one given a set of target fitness functions? 
- -``cdlib`` exposes a few standard techniques to address such an issue: all details can be found on the dedicated documentation page. - -.. toctree:: - :maxdepth: 1 - - validation.rst - .. _`cdlib`: https://github.com/GiulioRossetti/cdlib .. [Peel17] Peel, Leto, Daniel B. Larremore, and Aaron Clauset. "The ground truth about metadata and community detection in networks." Science Advances 3.5 (2017): e1602548. \ No newline at end of file diff --git a/docs/reference/events.rst b/docs/reference/events.rst new file mode 100644 index 00000000..9df3e4d1 --- /dev/null +++ b/docs/reference/events.rst @@ -0,0 +1,332 @@ +============================== +Community Events and LifeCycle +============================== + +Community events describe the changes in the community structure of a network over time. +The community structure of a network can change due to the arrival or departure of nodes, the creation or dissolution of communities, or the merging or splitting of communities. + +The ``cdlib`` library provides a set of tools to analyze the evolution of communities over time, including the detection of community events and the analysis of community life cycles. + +The interface of the library is designed to be as simple as possible, allowing users to easily analyze the evolution of communities in their networks. + +Check the ``LifeCycle`` class for more details: + +.. toctree:: + :maxdepth: 1 + + classes/lifecycle.rst + + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Clustering with Explicit LifeCycle +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Some dynamic community detection algorithms (e.g., Temporal trade-off ones) provide an explicit representation of the life cycle of communities. +In this case it is not necessary to detect community events as post-processing, as the life cycle of communities is already available. + +To analyze such pre-computed events apply the following snippet: + +.. 
code-block:: python + + from cdlib import LifeCycle + from cdlib import algorithms + import dynetx as dn + + dg = dn.DynGraph() + for x in range(10): + g = nx.erdos_renyi_graph(200, 0.05) + dg.add_interactions_from(list(g.edges()), t=x) + coms = algorithms.tiles(dg, 2) + + lc = LifeCycle(coms) + lc.compute_events_from_explicit_matching() + + + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Clustering without Explicit LifeCycle +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In case the dynamic community detection algorithm does not provide an explicit representation of the life cycle of communities, the library provides a set of tools to detect community events and analyze the life cycle of communities. +In particular, the library allows to identify events following four different strategies: + +- **Facets** events definition [Failla24]_ +- **Greene** events definition [Greene2010]_ +- **Asur** events definition [Asur2009]_ +- **Custom** events definition + +The first three strategies are based on the definition of community events proposed in the literature, while the last one allows users to define their own events. + +To apply one of the first three strategies, use the following snippet: + +.. code-block:: python + + from cdlib import LifeCycle + from networkx.generators.community import LFR_benchmark_graph + + tc = TemporalClustering() + for t in range(0, 10): + g = LFR_benchmark_graph( + n=250, + tau1=3, + tau2=1.5, + mu=0.1, + average_degree=5, + min_community=20, + seed=10, + ) + coms = algorithms.louvain(g) # here any CDlib algorithm can be applied + tc.add_clustering(coms, t) + + events = LifeCycle(tc) + events.compute_events("facets") # or "greene" or "asur" + +.. note:: + Each strategy has its parameters that can be specified passing a dictionary to the compute_events method. 
+ In particular, the ``facets`` strategy requires the specification of the ``min_branch_size`` parameter (default 1), while ``greene`` and ``asur`` require the specification of the ``threshold`` parameter (default 0.1). + + +To define custom events, use the following snippet: + +.. code-block:: python + + from cdlib import LifeCycle + from networkx.generators.community import LFR_benchmark_graph + + tc = TemporalClustering() + for t in range(0, 10): + g = LFR_benchmark_graph( + n=250, + tau1=3, + tau2=1.5, + mu=0.1, + average_degree=5, + min_community=20, + seed=10, + ) + coms = algorithms.louvain(g) # here any CDlib algorithm can be applied + tc.add_clustering(coms, t) + + events = LifeCycle(tc) + jaccard = lambda x, y: len(set(x) & set(y)) / len(set(x) | set(y)) + events.compute_events_with_custom_matching(jaccard, threshold=0.3, two_sided=True) + +In the above snippet, the ``jaccard`` function is used to define the similarity between two communities. +The ``threshold`` parameter is used to define the minimum similarity required to consider two communities one an evolution of the other. +Changing the similarity function and the threshold allows users to define their own matching strategies. + +^^^^^^^^^^^^^^^^^^^^^^^^^^ +Analyzing Events and Flows +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Once the community events have been detected, the library provides a set of tools to analyze them. +Each event is characterized by a set of properties, such as the type of event, the communities involved, the nodes involved, and the time of occurrence. + +.. note:: + + The library assigns a unique identifier to each community of the form ``t_c`` where ``t`` is the time of occurrence and ``c`` is the community identifier. + E.g., the community with identifier ``2_3`` is the community with identifier ``3`` at time ``2``. + +Each tracking strategy defines a different set of events (e.g., creation, dissolution, merging, splitting). 
+However, ``cdlib`` generalizes the concept of event, breaking it down into four components. For each generic temporal community ``t_c`` it provides access to: + +- **In flow**: the set of nodes that have entered the community ``t_c`` from clusters of time ``t-1``; +- **Out flow**: the set of nodes that will leave the community ``t_c`` at time ``t+1``; +- **From Events**: the set of events that generate the community observed at ``t`` that involved clusters at time ``t-1``; +- **To Events**: the set of events community ``t_c`` starts at time ``t`` that will affect clusters at time ``t+1``; + +All this information can be summarized in a community temporal-dependency digraph called ``polytree``. + +Here is an example of how to analyze community events and flows: + +.. code-block:: python + + from cdlib import LifeCycle + from networkx.generators.community import LFR_benchmark_graph + + tc = TemporalClustering() + for t in range(0, 10): + g = LFR_benchmark_graph( + n=250, + tau1=3, + tau2=1.5, + mu=0.1, + average_degree=5, + min_community=20, + seed=10, + ) + coms = algorithms.louvain(g) # here any CDlib algorithm can be applied + tc.add_clustering(coms, t) + + events = LifeCycle(tc) + events.compute_events("facets") # or "greene" or "asur" + event_types = events.get_event_types() # provide the list of available events for the detected method (in this case for 'facets') + + ev = events.get_event("1_2") # to compute events for all communities use the get_events() method + print(ev.out_flow) # to get the out flow of the community 1_2 + print(ev.in_flow) # to get the in flow of the community 1_2 + print(ev.from_event) # to get the from events of the community 1_2 + print(ev.to_event) # to get the to events of the community 1_2 + + out_flow = events.analyze_flow("1_2", "+") # if the community id is not specified all the communities are considered + in_flow = events.analyze_flow("1_2", "-") + +Each event is characterized by its degree of importance for the actual status of the
community. +In particular, ``facets`` events are fuzzy events (more than one can occur at the same time) while ``greene`` and ``asur`` events are crisp events (only one can occur at the same time). + +.. note:: + Following the ``facets`` terminology, ``analyze_flow`` and ``analyze_flows`` return a dictionary describing the flow in terms of its Unicity, Identity and Outflow. + For a detailed description of such measures, refer to [Failla24]_. + +In addition, if the temporal network comes with attributes associated with the nodes (either dynamically changing or not - e.g., political leanings), the library provides a set of tools to analyze the typicality of the events. + +Setting and retrieving node attributes is straightforward: + +.. code-block:: python + + from cdlib import LifeCycle + from networkx.generators.community import LFR_benchmark_graph + + def random_leaning(): + attrs = {} + for i in range(250): # 250 nodes + attrs[i] = {} + for t in range(10): # 10 time steps + attrs[i][t] = random.choice(["left", "right"]) + return attrs + + tc = TemporalClustering() + for t in range(0, 10): + g = LFR_benchmark_graph( + n=250, + tau1=3, + tau2=1.5, + mu=0.1, + average_degree=5, + min_community=20, + seed=10, + ) + coms = algorithms.louvain(g) # here any CDlib algorithm can be applied + tc.add_clustering(coms, t) + + events = LifeCycle(tc) + events.compute_events("facets") # or "greene" or "asur" + events.set_attribute(random_leaning(), "political_leaning") + attrs = events.get_attribute("political_leaning") + + events.analyze_flow("1_1", "+", attr="political_leaning") # to analyze the flow of political leaning in the community 1_1 + +Attributes are stored as a dictionary of dictionaries where the first key is the node id and the second key is the time step. + +If such information is available, the ``analyze_flow`` method will integrate in its analysis an evaluation of flow-attribute entropy.
+ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Visualizing Events and Flows +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The library provides a set of tools to visualize the events and flows detected in the community structure of a network. + +.. note:: + + The library uses the ``networkx`` library to represent the community structure of a network and the ``matplotlib`` / ``plotly`` libraries to visualize it. + +Here is an example of how to visualize community events, flows and polytree: + +.. code-block:: python + + from cdlib import LifeCycle + from cdlib.viz import ( + plot_flow, + plot_event_radar, + plot_event_radars, + typicality_distribution, + ) + from networkx.generators.community import LFR_benchmark_graph + + tc = TemporalClustering() + for t in range(0, 10): + g = LFR_benchmark_graph( + n=250, + tau1=3, + tau2=1.5, + mu=0.1, + average_degree=5, + min_community=20, + seed=10, + ) + coms = algorithms.louvain(g) # here any CDlib algorithm can be applied + tc.add_clustering(coms, t) + + events = LifeCycle(tc) + events.compute_events("facets") # or "greene" or "asur" + + fig = plot_flow(events) + fig.show() + + fig = plot_event_radar(events, "1_2", direction="+") # only out events + fig.show() + + fig = plot_event_radars(events, "1_2") # both in and out events + fig.show() + + fig = typicality_distribution(events, "+") + fig.show() + + dg = events.polytree() + fig = nx.draw_networkx(dg, with_labels=True) + fig.show() + +For a detailed description of the available methods and parameters, check the ``Visual Analytics`` section of the ``cdlib`` reference guide. + +^^^^^^^^^^^^^^^^ +Validating Flows +^^^^^^^^^^^^^^^^ + +The library provides a set of tools to statistically validate the observed flows against null models. + +Here is an example of how to validate the observed flows: + +.. 
code-block:: python + + from cdlib import LifeCycle + from cdlib.lifecycles.validation import validate_flow, validate_all_flows + from networkx.generators.community import LFR_benchmark_graph + + tc = TemporalClustering() + for t in range(0, 10): + g = LFR_benchmark_graph( + n=250, + tau1=3, + tau2=1.5, + mu=0.1, + average_degree=5, + min_community=20, + seed=10, + ) + coms = algorithms.louvain(g) # here any CDlib algorithm can be applied + tc.add_clustering(coms, t) + + events = LifeCycle(tc) + events.compute_events("facets") # or "greene" or "asur" + + cf = events.flow_null("1_2", "+", iterations=1000) # validate the out flow of community 1_2. Iterations define the number of randomizations to perform. + vf = events.all_flows_null("+", iterations=1000) # validate all out flows + +Both validation methods return a dictionary keyed by set identifier and valued by mean, std, and p-value of the observed flow against the null model. + +.. automodule:: cdlib.lifecycles + :members: + :undoc-members: + +.. autosummary:: + :toctree: generated/ + + flow_null + all_flows_null + + +.. [Failla24] Andrea Failla, Rémy Cazabet, Giulio Rossetti, Salvatore Citraro. "Redefining Event Types and Group Evolution in Temporal Data." arXiv preprint arXiv:2403.06771, 2024. + +.. [Asur2009] Sitaram Asur, Srinivasan Parthasarathy, Duygu Ucar. "An event-based framework for characterizing the evolutionary behavior of interaction graphs." ACM Transactions on Knowledge Discovery from Data (TKDD) 3.4 (2009): 1-36. + +.. [Greene2010] Derek Greene, Donal Doyle, Padraig Cunningham. "Tracking the evolution of communities in dynamic social networks." 2010 International Conference on Advances in Social Networks Analysis and Mining. IEEE, 2010.
\ No newline at end of file diff --git a/docs/reference/readwrite.rst b/docs/reference/readwrite.rst index fb2463b7..06a137c8 100644 --- a/docs/reference/readwrite.rst +++ b/docs/reference/readwrite.rst @@ -2,7 +2,11 @@ Input-Output ************ -Functions to save/load ``cdlib`` communities to/from file. +Functions to save/load ``cdlib`` communities and events to/from file. + +^^^^^^^^^^^^^ +Community I/O +^^^^^^^^^^^^^ ---------- CSV format @@ -32,6 +36,22 @@ JSON format allows the storage/loading of community discovery algorithm results :toctree: generated/ read_community_json + read_community_from_json_string write_community_json -.. note:: JSON formatting allows only saving/retrieving all kinds of Clustering objects and maintaining all their metadata - except for the graph object instance. \ No newline at end of file +.. note:: JSON formatting allows only saving/retrieving all kinds of Clustering objects and maintaining all their metadata - except for the graph object instance. + +^^^^^^^^^^^^^^^^^^^^ +Community Events I/O +^^^^^^^^^^^^^^^^^^^^ + +Events are a fundamental concept in the context of dynamic community discovery. The following methods allow you to read/write events to/from JSON. + +.. autosummary:: + :toctree: generated/ + + read_lifecycle_json + write_lifecycle_json + + + diff --git a/docs/reference/reference.rst b/docs/reference/reference.rst index 1f2889ed..3c6ccf36 100644 --- a/docs/reference/reference.rst +++ b/docs/reference/reference.rst @@ -1,8 +1,8 @@ ********* -Reference +API Guide ********* -``cdlib``comprises several modules, each fulfilling a different task related to community detection. +``cdlib`` comprises several modules, each fulfilling a different task related to community detection. .. 
toctree:: @@ -10,7 +10,10 @@ Reference classes.rst algorithms.rst + temporal_clustering.rst + events.rst evaluation.rst + validation.rst viz.rst readwrite.rst utils.rst \ No newline at end of file diff --git a/docs/reference/cd_algorithms/temporal_clustering.rst b/docs/reference/temporal_clustering.rst similarity index 83% rename from docs/reference/cd_algorithms/temporal_clustering.rst rename to docs/reference/temporal_clustering.rst index f6a5f89e..fe02acf4 100644 --- a/docs/reference/cd_algorithms/temporal_clustering.rst +++ b/docs/reference/temporal_clustering.rst @@ -4,6 +4,13 @@ Dynamic Community Discovery Algorithms falling in this category generate communities that evolve as time goes by. +Dynamic algorithms are organized to resemble the taxonomy proposed in [Rossetti18]_ + +- Instant Optimal, +- Temporal Trade-off + +For all details on the available methods to extract and manipulate dynamic communities, please refer to the ``TemporalClustering`` documentation. + .. automodule:: cdlib.algorithms @@ -34,14 +41,7 @@ Here is an example of a two-step built on top of Louvain partitions of a dynamic coms = algorithms.louvain(g) # here any CDlib algorithm can be applied tc.add_clustering(coms, t) -For what concerns the second stage (snapshots' node clustering matching), it is possible to parametrize the set similarity function as follows (example made with a standard Jaccard similarity): - -.. code-block:: python - - jaccard = lambda x, y: len(set(x) & set(y)) / len(set(x) | set(y)) - matches = tc.community_matching(jaccard, two_sided=True) - -For all details on the available methods to extract and manipulate dynamic communities, please refer to the ``TemporalClustering`` documentation. +For what concerns the second stage (snapshots' node clustering matching), refer to the ``Community Events and LifeCycle`` section of the ``cdlib`` documentation. 
^^^^^^^^^^^^^^^^^^ Temporal Trade-Off @@ -54,7 +54,13 @@ Dynamic Community Discovery algorithms falling into this category can be describ - Initialization: find communities for the initial state of the network; - Update: find communities at step t using graph at t and past information for each incoming step. +Currently ``cdlib`` features the following Temporal Trade-off algorithms: + .. autosummary:: - :toctree: algs/ + :toctree: generated/ tiles + + + +.. [Rossetti18] Rossetti, Giulio, and Rémy Cazabet. "Community discovery in dynamic networks: a survey." ACM Computing Surveys (CSUR) 51.2 (2018): 1-37. \ No newline at end of file diff --git a/docs/reference/validation.rst b/docs/reference/validation.rst index 9a6c407c..608ce8c0 100644 --- a/docs/reference/validation.rst +++ b/docs/reference/validation.rst @@ -1,6 +1,11 @@ -****************** -Ranking Algorithms -****************** +******************************* +Validate CD Algorithms Rankings +******************************* + +.. note:: + + Once a set of alternative clusterings have been extracted from a given network, is there a way to select the *best* one given a set of target fitness functions? + Let us assume that you ran a set **X** of community discovery algorithms on a given graph **G** and that you computed a set **Y** of fitness scores for each of the obtained clustering. diff --git a/docs/reference/viz.rst b/docs/reference/viz.rst index ed01665c..0d88fa05 100644 --- a/docs/reference/viz.rst +++ b/docs/reference/viz.rst @@ -36,4 +36,19 @@ Community evaluation outputs can be easily used to represent the main partition plot_sim_matrix plot_com_stat plot_com_properties_relation - plot_scoring \ No newline at end of file + plot_scoring + + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Dynamic Community Events plots +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Dynamic community detection algorithms can be evaluated using the dynamic community events framework. The results can be visualized using the following functions. + +.. 
autosummary:: + :toctree: generated/ + + plot_flow + plot_event_radar + plot_event_radars + typicality_distribution \ No newline at end of file diff --git a/environment.yml b/environment.yml index 725db7f1..20ada6e4 100644 --- a/environment.yml +++ b/environment.yml @@ -21,3 +21,4 @@ dependencies: - dynetx - thresholdclustering - python-Levenshtein +- plotly diff --git a/requirements.txt b/requirements.txt index 345f9a4c..55482c09 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,18 +1,19 @@ numpy scikit-learn tqdm -networkx >= 3.0 +networkx>=3.0 demon -python-louvain >= 0.16 -scipy >= 1.10 +python-louvain>=0.16 +scipy>=1.10 pulp seaborn pandas eva_lcd bimlpa -python-igraph >= 0.10 +python-igraph>=0.10 angelcommunity pooch dynetx thresholdclustering -python-Levenshtein \ No newline at end of file +python-Levenshtein +plotly