Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove disconnected nodes from the graph #105

Merged
merged 2 commits into from
Aug 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 39 additions & 1 deletion sinr/graph_embeddings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pickle as pk

from networkit import Graph, components, community, setNumberOfThreads, getCurrentNumberOfThreads, getMaxNumberOfThreads, Partition
import networkit.graphtools as nkg
from numpy import argpartition, argsort, asarray, where, nonzero, concatenate, repeat, mean, nanmax, int64, shape, delete, nanmean
from sklearn.neighbors import NearestNeighbors
from scipy import spatial
Expand Down Expand Up @@ -48,6 +49,7 @@ def load_from_cooc_pkl(cls, cooc_matrix_path, n_jobs=-1):

word_to_idx, matrix = strategy_loader.load_pkl_text(cooc_matrix_path)
graph = get_graph_from_matrix(matrix)
graph, word_to_idx = get_compact_lgcc(graph, word_to_idx)
out_of_LgCC = get_lgcc(graph)
logger.info("Finished building graph.")
return cls(graph, out_of_LgCC, word_to_idx)
Expand All @@ -67,6 +69,7 @@ def load_from_adjacency_matrix(cls, matrix_object, labels=None, n_jobs=-1):
logger.info("Building Graph.")
word_to_idx, matrix = strategy_loader.load_adj_mat(matrix_object, labels)
graph = get_graph_from_matrix(matrix)
graph, word_to_idx = get_compact_lgcc(graph, word_to_idx)
out_of_LgCC = get_lgcc(graph)
logger.info("Finished building graph.")
return cls(graph, out_of_LgCC, word_to_idx)
Expand All @@ -87,6 +90,7 @@ def load_from_graph(cls, graph, n_jobs=-1):
for u in graph.iterNodes():
word_to_idx[u] = idx
idx += 1
graph, word_to_idx = get_compact_lgcc(graph, word_to_idx)
out_of_LgCC = get_lgcc(graph)
logger.info("Finished building graph.")
return cls(graph, out_of_LgCC, word_to_idx)
Expand Down Expand Up @@ -230,7 +234,7 @@ def get_out_of_LgCC_coms(self, communities):
set_out_of_LgCC = set(self.out_of_LgCC)
out_of_LgCC_coms = []
for com in communities.getSubsetIds():
if set(communities.getMembers()) & set_out_of_LgCC != {}:
if set(communities.getMembers(com)) & set_out_of_LgCC != {}:
out_of_LgCC_coms.append(com)
return out_of_LgCC_coms

Expand Down Expand Up @@ -311,6 +315,40 @@ def get_graph_from_matrix(matrix):
graph.addEdge(u=row, v=col, w=weight, addMissing=True)
return graph

def get_compact_lgcc(graph, word_to_idx):
    """Remove isolated (degree-0) nodes and compact the remaining node ids.

    Nodes without any edge cannot belong to the largest connected component;
    they are deleted together with their vocabulary entries, and the surviving
    node ids are remapped to a continuous range so the graph and the
    vocabulary stay aligned.

    :param graph: The input graph
    :type graph: networkit graph
    :param word_to_idx: The words mapped to their initial node ids
    :type word_to_idx: dictionary

    :returns: The compacted graph and the updated word-to-id dictionary
    :rtype: networkit graph, dictionary

    """

    # Collect first, then delete: removing nodes while iterating
    # iterNodes() would invalidate the iteration.
    isolated_nodes = [u for u in graph.iterNodes() if graph.degree(u) == 0]

    if isolated_nodes:
        # Remove the isolated nodes and the words mapped to them.
        idx_to_word = _flip_keys_values(word_to_idx)
        for u in isolated_nodes:
            graph.removeNode(u)
            del idx_to_word[u]
        word_to_idx = _flip_keys_values(idx_to_word)
        # Re-number the surviving nodes to continuous ids starting at 0.
        idx_map = nkg.getContinuousNodeIds(graph)
        graph = nkg.getCompactedGraph(graph, idx_map)
        # Keep the vocabulary ids in sync with the new node ids.
        word_to_idx = {word: idx_map[old_id] for word, old_id in word_to_idx.items()}

    return graph, word_to_idx


class NoCommunityDetectedException(Exception):
"""Exception raised when no community detection has been performed thus leaving `self.communities` to its default value `None`. """
Expand Down
15 changes: 11 additions & 4 deletions tests/test_sinr_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ def test_load_voc(self):

res = set(chain(*self.sentences))
# ['a', 'fun', 'is', 'package', 'python', 'sinr']
# The node of 'is' is outside the lgcc, 'is' is removed from the model vocabulary
# ['a', 'fun', 'package', 'python', 'sinr']
res.remove('is')
self.assertEquals(set(self.sinr_from_cooc.get_vocabulary()), res)

self.assertEquals(set(self.sinr_from_graph.get_vocabulary()), set(self.G.iterNodes()))
Expand All @@ -95,8 +98,11 @@ def test_detect_communities(self):
communities = communities.getVector()
# ['a', 'fun', 'is', 'package', 'python', 'sinr']
# [0 ,1 ,2 ,0 ,0 ,1]
self.assertAlmostEqual(rand_score([0,1,2,0,0,1], communities), 1)
self.assertAlmostEqual(rand_score([0,1,2,0,0,1], self.sinr_from_cooc.get_communities().getVector()), 1)

# the node of 'is' (id 3) is disconnected from the lgcc, it is removed
# [0, 1, 0, 0, 1]
self.assertAlmostEqual(rand_score([0,1,0,0,1], communities), 1)
self.assertAlmostEqual(rand_score([0,1,0,0,1], self.sinr_from_cooc.get_communities().getVector()), 1)

def test_extract_embeddings(self):
communities = self.sinr_from_graph.detect_communities(gamma=1, inspect=False)
Expand All @@ -117,8 +123,9 @@ def test_extract_embeddings(self):
self.fail("Nr not equals to what is expected")

# Graphe : a-package-python, fun-sinr, is
# "is" of community 2 is not connected
ref = csr_matrix([[1, 0, 0],[0,1,0],[0, 0, 0], [1, 0, 0], [1, 0, 0], [0,1, 0]])
# "is" of community 2 is not connected, the node and the word are removed
# There are only 2 communities remaining
ref = csr_matrix([[1, 0],[0,1], [1, 0], [1, 0], [0,1]])
communities = self.sinr_from_cooc.detect_communities(gamma=1, inspect=False)
self.sinr_from_cooc.extract_embeddings(communities)
nr = self.sinr_from_cooc.get_nr()
Expand Down
Loading