Commit

Merge pull request #240 from seoanezonjic/master

HLC implementation

GiulioRossetti authored May 29, 2024
2 parents 72772ec + aac3043 commit 68ed449

Showing 4 changed files with 303 additions and 4 deletions.
95 changes: 92 additions & 3 deletions cdlib/algorithms/edge_clustering.py
@@ -1,10 +1,10 @@
from cdlib import EdgeClustering
from collections import defaultdict
import networkx as nx
from cdlib.utils import convert_graph_formats
from cdlib.algorithms.internal.HLC import HLC, HLC_read_edge_list_unweighted
from cdlib.utils import convert_graph_formats, nx_node_integer_mapping, remap_edge_communities
from cdlib.algorithms.internal.HLC import HLC, HLC_read_edge_list_unweighted, HLC_read_edge_list_weighted, HLC_full

__all__ = ["hierarchical_link_community"]
__all__ = ["hierarchical_link_community", "hierarchical_link_community_w", "hierarchical_link_community_full"]


def hierarchical_link_community(g_original: object) -> EdgeClustering:
@@ -50,3 +50,92 @@ def hierarchical_link_community(g_original: object) -> EdgeClustering:

coms = [list(c) for c in coms.values()]
return EdgeClustering(coms, g_original, "HLC", method_parameters={})


def hierarchical_link_community_w(g_original: object) -> EdgeClustering:
"""
HLC (hierarchical link clustering) is a method to classify links into topologically related groups.
The algorithm uses a similarity between links to build a dendrogram where each leaf is a link from the original network and branches represent link communities.
At each level of the link dendrogram the partition density function, based on link density inside communities, is calculated in order to pick the best level to cut.
**Supported Graph Types**
========== ======== ========
Undirected Directed Weighted
========== ======== ========
Yes No Yes
========== ======== ========
:param g_original: a networkx/igraph object
:return: EdgeClustering object
:Example:
>>> from cdlib import algorithms
>>> import networkx as nx
>>> G = nx.karate_club_graph()
>>> com = algorithms.hierarchical_link_community_w(G)
:References:
Ahn, Yong-Yeol, James P. Bagrow, and Sune Lehmann. `Link communities reveal multiscale complexity in networks. <https://www.nature.com/articles/nature09182/>`_ Nature 466.7307 (2010): 761.
"""

g = convert_graph_formats(g_original, nx.Graph)

adj, edges, ij2wij = HLC_read_edge_list_weighted(g)
edge2cid, _, _, _ = HLC(adj, edges).single_linkage(w=ij2wij)

coms = defaultdict(list)
for e, com in edge2cid.items():
coms[com].append(e)

coms = [list(c) for c in coms.values()]
return EdgeClustering(coms, g_original, "HLC_w", method_parameters={})


"""
HLC (hierarchical link clustering) is a method to classify links into topologically related groups.
The algorithm uses a similarity between links to build a dendrogram where each leaf is a link from the original network and branches represent link communities.
At each level of the link dendrogram is calculated the partition density function, based on link density inside communities, to pick the best level to cut.
This implementation follows exactly the algorithm described in Ahn et al and uses numpy/scipy to improve the clustering computation (It is faster and consumes less memory.
**Supported Graph Types**
========== ======== ========
Undirected Directed Weighted
========== ======== ========
Yes No Yes
========== ======== ========
:param g_original: a networkx/igraph object
:weight: None for unweighted networks, jaccard approximation is used. When defined with a string, edge attribute name (usually 'weight') to be used as weight and Tanimoto approximation is used.
:simthr: None by default. If set to float, all values less than threshold are set to 0 in similarity matrix (it could reduce memory usage).
:hcmethod: Linkage method used in hierarchical clustering, 'single' by default. See scipy.cluster.hierarchy.linkage to get full method list.
:min_edges: None by default. If set to float, minimum number of edges that a community must contain to be kept in the clustering
:verbose: If True, write intermediary steps to disk.
:return: EdgeClustering object
:Example:
>>> from cdlib import algorithms
>>> import networkx as nx
>>> G = nx.karate_club_graph()
>>> com = algorithms.hierarchical_link_community_full(G)
:References:
Ahn, Yong-Yeol, James P. Bagrow, and Sune Lehmann. `Link communities reveal multiscale complexity in networks. <https://www.nature.com/articles/nature09182/>`_ nature 466.7307 (2010): 761.
"""
def hierarchical_link_community_full(g_original: object, weight='weight', simthr=None, hcmethod='single', min_edges= None, verbose=False) -> EdgeClustering:
g = convert_graph_formats(g_original, nx.Graph)
g_number, dictio = nx_node_integer_mapping(g)

coms = HLC_full(g_number, weight=weight, simthr=simthr, hcmethod=hcmethod, min_edges=min_edges, verbose=verbose, dictio=dictio).clusters
clustering = EdgeClustering(coms, g_number, "HLC_f", method_parameters={})
if dictio is not None: clustering.communities = remap_edge_communities(clustering.communities, dictio)
return clustering
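
# Illustrative usage (a sketch, not part of this commit): the optional parameters can be
# combined, e.g. clustering the graph as unweighted and keeping only communities with at
# least three edges. The parameter values below are arbitrary examples.
# >>> from cdlib import algorithms
# >>> import networkx as nx
# >>> G = nx.karate_club_graph()
# >>> com = algorithms.hierarchical_link_community_full(G, weight=None, min_edges=3)
# >>> isinstance(com.communities, list)
# True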
188 changes: 187 additions & 1 deletion cdlib/algorithms/internal/HLC.py
@@ -2,7 +2,9 @@
from heapq import heappush, heappop
from itertools import combinations, chain
from collections import defaultdict

import networkx as nx
import numpy as np
from scipy.cluster.hierarchy import linkage

# Hierarchical Link Community
"""
@@ -220,3 +222,187 @@ def cal_jaccard(intersect_val, left_val, right_val):
similarity_ratio = cal_jaccard(ai_dot_aj, n2a_sqrd[i], n2a_sqrd[j])
heappush(min_heap, (1 - similarity_ratio, edge_pair))
return [heappop(min_heap) for _ in range(len(min_heap))]

class HLC_full(object):
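"""
Hierarchical link clustering (Ahn et al., 2010) implemented with numpy/scipy: builds a
condensed edge-similarity distance vector, runs scipy hierarchical clustering on it and
cuts the resulting dendrogram at the level of maximum partition density. The resulting
edge communities are stored in self.clusters.
"""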
def __init__(self, net, weight='weight', simthr=None, hcmethod=None, min_edges=None, verbose=False, dictio=None):
self.edge_counter = 0
self.clusters = self.edge_clustering(net, weight=weight, simthr=simthr, hcmethod=hcmethod, min_edges=min_edges, verbose=verbose, dictio=dictio)

def edge_clustering(self, net, weight=None, simthr=None, hcmethod=None, min_edges=None, verbose=False, dictio=None):
condensed_dist_vector, edge_matrix_len, edge_list = self.get_edge_similarity_matrix(net, weight=weight, simthr=simthr, verbose=verbose, dictio=dictio) # TODO: add a Jaccard similarity matrix that does not use weights
clustering = linkage(condensed_dist_vector, hcmethod)
final_clusters = self.get_clusters_by_partition_density(clustering, edge_matrix_len, edge_list, min_edges=min_edges)
return final_clusters

def get_edge_similarity_matrix(self, net, weight=None, simthr=None, verbose=False, dictio=None):
node_list = list(net.nodes())
node_list.sort()
adj = nx.adjacency_matrix(net, weight=weight, nodelist=node_list)
adj = adj.toarray() # Convert the sparse (CSR) adjacency to a dense array; the dot product below differs if this line is removed!

if weight is None:
degree = np.sum(adj, axis=1)
np.fill_diagonal(adj, 1)
else:
degree = adj > 0
degree = np.sum(degree, axis=1)
weight_sum = np.sum(adj, axis=1)
np.fill_diagonal(adj, weight_sum/degree) # Ahn equation 4 in the supplementary file: a_ii is the mean weight of i's incident edges

dotproduct = np.dot(adj, adj) # This efficiently calculates the vector products needed for tanimoto coefficient (ai and aj)
adj = None # Remove matrix in order to save memory
edge_dict = {}
data = []; col_i = []; col_j = [] # To save tanimoto similarities as sparse data
cache = {} # To cache similarity values by pair and avoid recalculating when a pair is repeated
# k_node, i_neigh, j_neigh are both adj matrix indexes AND node names
edge_similarities = []
for k_node in node_list: # take each node as reference and compute the similarity of each pair of edges incident to it
neighbor_list = list(net.neighbors(k_node)) # take neighbors to get pairs and compare edges
neighbor_list.sort()
if len(neighbor_list) < 2: continue # Skip k nodes that do NOT have enough neighbours to compare edge pairs
while len(neighbor_list) > 0:
i_neigh = neighbor_list.pop()
for j_neigh in neighbor_list:
if weight is not None:
sim = self.get_tanimoto_index(i_neigh, j_neigh, dotproduct, cache)
else:
sim = self.get_jaccard_index(i_neigh, j_neigh, dotproduct, degree, cache)
if simthr is not None and sim < simthr: continue
pair = [self.get_edge_id(k_node, i_neigh, edge_dict), self.get_edge_id(k_node, j_neigh, edge_dict)]
pair.sort()
ki_edge_id, kj_edge_id = pair
data.append(sim)
if verbose:
a_pair = '_'.join(sorted([dictio[k_node], dictio[i_neigh]]))
b_pair = '_'.join(sorted([dictio[k_node], dictio[j_neigh]]))
ids = sorted([a_pair, b_pair])
edge_similarities.append(ids + [str(sim)])
col_i.append(ki_edge_id)
col_j.append(kj_edge_id)

if verbose:
with open('edge_scores.txt', 'w') as f:
for e in edge_similarities: f.write("\t".join(e) + "\n")
condensed_dist_vector, edge_matrix_len = self.get_distance_condensed_vector(data, col_i, col_j)
return condensed_dist_vector, edge_matrix_len, list(edge_dict.keys())

def get_tanimoto_index(self, i_neigh, j_neigh, dotproduct, cache):
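# Tanimoto coefficient between the (inclusive) neighbour vectors a_i and a_j, as in Ahn et al.:
# S = a_i . a_j / (|a_i|^2 + |a_j|^2 - a_i . a_j); all dot products were precomputed in dotproduct.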
sim, pair = self.get_sim(i_neigh, j_neigh, cache)
if sim is None:
a_i2 = dotproduct[i_neigh,i_neigh]
a_j2 = dotproduct[j_neigh,j_neigh]
a_ij = dotproduct[i_neigh, j_neigh]
sim = a_ij / (a_i2 + a_j2 - a_ij)
cache[pair] = sim
return sim

def get_jaccard_index(self, i_neigh, j_neigh, dotproduct, degree, cache):
sim, pair = self.get_sim(i_neigh, j_neigh, cache)
if sim is None:
a_ij = dotproduct[i_neigh, j_neigh]
sim = a_ij / min(degree[i_neigh], degree[j_neigh])
cache[pair] = sim
return sim

def get_sim(self, i_neigh, j_neigh, cache):
pair = [i_neigh, j_neigh]
pair.sort()
pair = tuple(pair)
sim = cache.get(pair)
return sim, pair

def get_edge_id(self, a, b, e_dict):
e = [a, b]
e.sort()
edge_id = tuple(e)
e_index = e_dict.get(edge_id)
if e_index is None:
e_index = self.edge_counter
e_dict[edge_id] = e_index
self.edge_counter += 1
return e_index

def get_distance_condensed_vector(self, data, col_i, col_j):
edge_matrix_len = max([max(col_i), max(col_j)]) + 1 # Values in col_i and col_j are 0 based indexes so we need to add 1 to get the vector size
upper_triangle_size = (edge_matrix_len**2 - edge_matrix_len)//2
condensed_vector = np.ones(upper_triangle_size)
for idx, sim in enumerate(data): # m * i + j - ((i + 2) * (i + 1)) / 2 from https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
i = col_i[idx]
j = col_j[idx]
v = edge_matrix_len * i + j - ((i + 2) * (i + 1)) // 2
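# Worked example of the index formula (illustrative): with edge_matrix_len = 4, the pair
# (i=1, j=3) maps to v = 4*1 + 3 - (3*2)//2 = 4, i.e. the fifth entry of the condensed
# upper-triangle vector, matching the ordering used by scipy.spatial.distance.squareform.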
condensed_vector[v] = 1 - sim
return condensed_vector, edge_matrix_len

def get_clusters_by_partition_density(self, clustering, edge_len, edge_list, min_edges=None):
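# Walk up the linkage dendrogram: every time the merge distance changes, compute the
# partition density of the current flat clustering and keep the set of clusters that
# maximises it, i.e. the best level at which to cut the dendrogram (Ahn et al.).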
tree = {} # clust id : [member_ids]
edges_per_cluster = {} #clust_id : [ edge_tuples ]
partial_partition_densities = {} #clust_id : [ cluster_partition_density ]

counter = edge_len # this works as cluster id. This is used by the linkage method to tag the intermediate clusters: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
last_dist = None
constant = 2/edge_len
last_cluster_pool = []
max_pden = -10000000000
max_cluster_ids = []
for a_id, b_id, dist, n_members in clustering:
dist = round(dist, 5) # Treat distances that differ only by tiny numerical errors as equal
if last_dist is not None and dist != last_dist: # We can have several merges at the same dist, so we group the merge events before calculating the partition density
p_den = self.get_pden(last_cluster_pool, partial_partition_densities, constant)
if p_den > max_pden: # check the best partition density
max_pden = p_den
max_cluster_ids = last_cluster_pool
a_id = int(a_id) # Linkage method returns member ids as float instead of int
b_id = int(b_id)
member_list = self.get_member_list(counter, a_id, b_id, edge_len, tree) # members that we merge to build the new agglomerative cluster
nodes, edges = self.get_nodesNedges_per_cluster(member_list, edge_list)
edges_per_cluster[counter] = edges
partial_partition_densities[counter] = self.get_cluster_partial_partition_density(member_list, nodes)
last_cluster_pool = [cl_id for cl_id in last_cluster_pool if cl_id not in [a_id, b_id]] # update the cluster pool: remove the merged cluster ids before appending the new cluster id
last_cluster_pool.append(counter)
last_dist = dist
counter += 1

p_den = self.get_pden(last_cluster_pool, partial_partition_densities, constant) # compute the partition density for the final cluster pool
if p_den > max_pden: # check the partition density at the last distance level, which was not checked inside the loop
max_pden = p_den
max_cluster_ids = last_cluster_pool

final_clusters = []
for cluster_id in max_cluster_ids:
members = edges_per_cluster[cluster_id]
if min_edges is None or len(members) >= min_edges:
final_clusters.append(members)
return final_clusters

def add_cluster_members(self, cluster, member_id, n_records, tree):
if member_id < n_records: # a member_id lower than n_records refers to an original observation (a single edge), following the id convention described in https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
cluster.append(member_id)
else: # The id refers to the merge of two previous clusters: pop its member list from the tree and merge it into the new cluster
cluster.extend(tree.pop(member_id))

def get_member_list(self, cluster_id, a_id, b_id, edge_len, tree):
member_list = []
self.add_cluster_members(member_list, a_id, edge_len, tree) # get cluster members from previous cluster a
self.add_cluster_members(member_list, b_id, edge_len, tree) # get cluster members from previous cluster b
tree[cluster_id] = member_list
return member_list

def get_nodesNedges_per_cluster(self, members, edge_list):
nodes = []
edges = []
for member in members:
edge = edge_list[member]
edges.append(edge)
nodes.extend(edge) # Add edge nodes to node list
return list(set(nodes)), edges

def get_cluster_partial_partition_density(self, edges, nodes):
n = len(nodes) # number of nodes
m = len(edges) # number of links
#return (m - (n - 1)) / ((n * (n - 1) / 2) - (n - 1)) # per-cluster partition density D_c as defined by Ahn et al.
return (m*(m-n+1))/((n-2)*(n-1)) #kalinka
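# Note: once multiplied by constant = 2/edge_len (i.e. 2/M) in get_pden and summed over clusters,
# this term reproduces Ahn's partition density D = (2/M) * sum_c m_c (m_c - (n_c - 1)) / ((n_c - 1)(n_c - 2)).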

def get_pden(self, last_cluster_pool, partial_partition_densities, constant):
partition_den_sum = sum([ partial_partition_densities[cl_id] for cl_id in last_cluster_pool]) #Partition density
p_den = constant * partition_den_sum
return p_den
10 changes: 10 additions & 0 deletions cdlib/test/test_edgeclustering.py
@@ -12,6 +12,16 @@ def test_to_json(self):
js = coms.to_json()
self.assertIsInstance(js, str)

coms = algorithms.hierarchical_link_community_w(g)
self.assertIsInstance(coms, EdgeClustering)
js = coms.to_json()
self.assertIsInstance(js, str)

coms = algorithms.hierarchical_link_community_full(g)
self.assertIsInstance(coms, EdgeClustering)
js = coms.to_json()
self.assertIsInstance(js, str)

def test_node_map(self):
g = nx.karate_club_graph()
coms = algorithms.hierarchical_link_community(g)
14 changes: 14 additions & 0 deletions cdlib/utils.py
@@ -239,6 +239,20 @@ def remap_node_communities(communities: object, node_map: dict) -> list:
communities = cms
return communities

def remap_edge_communities(communities: object, node_map: dict) -> list: # edge-community variant: remap_node_communities cannot handle edge tuples
"""Apply a map to the obtained communities to retrieve the original node labels
:param communities: EdgeClustering object
:param node_map: dictionary <numeric_id, node_label>
:return: remapped communities
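:Example:
>>> node_map = {0: "A", 1: "B", 2: "C"}  # hypothetical integer-to-label mapping
>>> remap_edge_communities([[(0, 1), (1, 2)]], node_map)
[[('A', 'B'), ('B', 'C')]]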
"""

cms = []
for community in communities:
community = [(node_map[e[0]], node_map[e[1]]) for e in community]
cms.append(community)
communities = cms
return communities

def affiliations2nodesets(affiliations: dict) -> dict:
"""