Commit cab67bd: Resolve merge conflict by incorporating both suggestions

nicolasdugue committed Jul 22, 2024
2 parents 8611e19 + 07e8ebc
Showing 6 changed files with 217 additions and 1 deletion.
131 changes: 131 additions & 0 deletions paper.bib
@@ -0,0 +1,131 @@
@inproceedings{prouteau2021sinr,
title={{SINr}: Fast Computing of Sparse Interpretable Node Representations is not a Sin!},
author={Prouteau, Thibault and Connes, Victor and Dugu{\'e}, Nicolas and Perez, Anthony and Lamirel, Jean-Charles and Camelin, Nathalie and Meignier, Sylvain},
booktitle={IDA},
pages={325--337},
year={2021},
doi={10.1007/978-3-030-74251-5_26}
}

@inproceedings{prouteau2022embedding,
title={Are Embedding Spaces Interpretable? Results of an Intrusion Detection Evaluation on a Large French Corpus},
author={Prouteau, Thibault and Dugu{\'e}, Nicolas and Camelin, Nathalie and Meignier, Sylvain},
booktitle={LREC},
year={2022}
}

@article{mikolov2013efficient,
title={Efficient estimation of word representations in vector space},
author={Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
journal={arXiv preprint arXiv:1301.3781},
year={2013},
doi={10.48550/arXiv.1301.3781}
}

@inproceedings{pennington2014glove,
title={{GloVe}: Global vectors for word representation},
author={Pennington, Jeffrey and Socher, Richard and Manning, Christopher D},
booktitle={EMNLP},
pages={1532--1543},
year={2014},
doi={10.3115/v1/D14-1162}
}

@article{le2019flaubert,
title={{FlauBERT}: Unsupervised language model pre-training for {F}rench},
author={Le, Hang and Vial, Lo{\"\i}c and Frej, Jibril and Segonne, Vincent and Coavoux, Maximin and Lecouteux, Benjamin and Allauzen, Alexandre and Crabb{\'e}, Benoit and Besacier, Laurent and Schwab, Didier},
journal={arXiv preprint arXiv:1912.05372},
year={2019},
doi={10.48550/arXiv.1912.05372}
}

@article{martin2019camembert,
title={{CamemBERT}: a tasty {F}rench language model},
author={Martin, Louis and Muller, Benjamin and Su{\'a}rez, Pedro Javier Ortiz and Dupont, Yoann and Romary, Laurent and de La Clergerie, {\'E}ric Villemonte and Seddah, Djam{\'e} and Sagot, Beno{\^\i}t},
journal={arXiv preprint arXiv:1911.03894},
year={2019},
doi={10.48550/arXiv.1911.03894}
}

@inproceedings{grover2016node2vec,
title={{node2vec}: Scalable feature learning for networks},
author={Grover, Aditya and Leskovec, Jure},
booktitle={SIGKDD},
pages={855--864},
year={2016},
doi={10.1145/2939672.2939754}
}

@inproceedings{ou2016asymmetric,
title={Asymmetric transitivity preserving graph embedding},
author={Ou, Mingdong and Cui, Peng and Pei, Jian and Zhang, Ziwei and Zhu, Wenwu},
booktitle={SIGKDD},
pages={1105--1114},
year={2016},
doi={10.1145/2939672.2939751}
}

@inproceedings{cao2016deep,
title={Deep neural networks for learning graph representations},
author={Cao, Shaosheng and Lu, Wei and Xu, Qiongkai},
booktitle={AAAI Conference on Artificial Intelligence},
volume={30},
number={1},
year={2016},
doi={10.1609/aaai.v30i1.10179}
}

@inproceedings{subramanian2018spine,
title={{SPINE}: Sparse Interpretable Neural Embeddings},
author={Subramanian, Anant and Pruthi, Danish and Jhamtani, Harsh and Berg-Kirkpatrick, Taylor and Hovy, Eduard},
booktitle={AAAI Conference on Artificial Intelligence},
volume={32},
number={1},
year={2018},
doi={10.48550/arXiv.1711.08792}
}

@article{blondel2008fast,
title={Fast unfolding of communities in large networks},
author={Blondel, Vincent D and Guillaume, Jean-Loup and Lambiotte, Renaud and Lefebvre, Etienne},
journal={Journal of Statistical Mechanics: Theory and Experiment},
volume={2008},
number={10},
pages={P10008},
year={2008},
doi={10.1088/1742-5468/2008/10/P10008}
}

@inproceedings{rozemberczki2020karate,
title={{Karate Club}: an {API} oriented open-source {P}ython framework for unsupervised learning on graphs},
author={Rozemberczki, Benedek and Kiss, Oliver and Sarkar, Rik},
booktitle={CIKM},
pages={3125--3132},
year={2020},
doi={10.1145/3340531.3412757}
}

@article{vrehuuvrek2011gensim,
title={Gensim—statistical semantics in {P}ython},
author={{\v{R}}eh{\r{u}}{\v{r}}ek, Radim and Sojka, Petr and others},
journal={Retrieved from gensim.org},
year={2011},
publisher={Citeseer}
}

@inproceedings{akbik2019flair,
title={{FLAIR}: An easy-to-use framework for state-of-the-art {NLP}},
author={Akbik, Alan and Bergmann, Tanja and Blythe, Duncan and Rasul, Kashif and Schweter, Stefan and Vollgraf, Roland},
booktitle={{NAACL} (Demonstrations)},
pages={54--59},
year={2019},
doi={10.18653/v1/N19-4010}
}

@book{vasiliev2020natural,
title={Natural language processing with Python and spaCy: A practical introduction},
author={Vasiliev, Yuli},
year={2020},
publisher={No Starch Press}
}

85 changes: 85 additions & 0 deletions paper.md
@@ -0,0 +1,85 @@

---
title: 'SINr: a Python package to train interpretable word
and graph embeddings'
tags:
- Python
- computer science
- natural language processing
- graphs
- social networks analysis
authors:
- name: Thibault Prouteau
orcid: 0000-0001-9610-3191
equal-contrib: false
corresponding: true
affiliation: 1
- name: Nicolas Dugué
orcid: 0000-0002-6150-1939
corresponding: true
equal-contrib: false
affiliation: 1
- name: Simon Guillot
equal-contrib: false
affiliation: 1
- name: Anthony Perez
corresponding: false
equal-contrib: false
affiliation: 2
affiliations:
- name: Le Mans Université, LIUM, EA 4023, Laboratoire d'Informatique de l'Université du Mans, France
index: 1
- name: Univ. Orléans, INSA Centre Val de Loire, LIFO EA 4022, Orléans, France
index: 2

date: 22 February 2023
bibliography: paper.bib

---

# Summary

In this paper, we introduce SINr, a Python package to train word and graph embeddings. Word embeddings encode the meaning of words into vector representations that can be used by algorithms. The principle was later transferred to the study of graphs, mathematical structures that represent entities of the real world and their relations (social networks, for instance). These information-carrying vectors are often the first building block of more complex artificial intelligence pipelines. With SINr, we focus on providing frugal and interpretable algorithms to build this first block. Indeed, because of the growing complexity of the algorithms used in artificial intelligence (deep learning), frugality and interpretability have become major concerns. The SINr approach is based on community detection: the vector of a node is built upon the distribution of its connections across the communities detected on the graph at hand. The algorithm thus runs very efficiently and does not require GPUs, allowing broad usage. Furthermore, the dimensions of the embedding space are interpretable, since they are based on the extracted communities. One can thus interact with the dimensions of the vectors and inspect the meaning of the representation.


# Statement of need and functionalities

The SINr package can be used in the fields of natural language processing (NLP), digital humanities, and network analysis. It represents words and network entities as vectors in mathematical spaces called embedding spaces.

With neural approaches, tremendous progress has been made in NLP, notably to represent the vocabulary of the language at hand. These representations, dense numeric vectors named word embeddings, are then used as input for machine learning algorithms. Examples of approaches to train such vectors on large textual corpora are Word2vec [@mikolov2013efficient], GloVe [@pennington2014glove], and the Transformer-based approaches for contextualized representations, such as CamemBERT [@martin2019camembert] or FlauBERT [@le2019flaubert] for French. This progress was transferred to the graph universe, allowing the emergence of graph embedding, a whole field of research with Word2vec-inspired approaches such as Node2vec [@grover2016node2vec], matrix factorization methods like HOPE [@ou2016asymmetric], and auto-encoding paradigms [@cao2016deep].

![Illustration of SINr: vertices are represented based on the communities they are linked to. \label{fig:2}](sinr_working.png)

SINr was introduced to take advantage of this progress: it embeds words and nodes just like the aforementioned methods. However, it is based on community detection: for each node, the embedding vector is computed as the proportion of its links going to each community, as described in \autoref{fig:2} (a minimal sketch of this computation is given after the list below). This approach avoids some flaws inherent in the usual approaches:

- As far as we know, SINr is the first approach specifically designed to deal with both word and graph embeddings. Text corpora are represented as graphs, and with the adequate preprocessing provided by the package, word embeddings can easily be extracted with SINr. For graph embedding, no specific preprocessing is required.
- Contrary to neural approaches that require complex GPU computations, SINr is based on the Louvain algorithm [@blondel2008fast] to detect communities and thus runs in linear time; it can be executed on a standalone laptop. Still, it can easily be interfaced with the gensim package for word embedding, or with the karateclub package for graph embedding.
- Contrary to the usual approaches, the space in which words and graphs are embedded with SINr is interpretable, because its dimensions are based on the communities.
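
To make the principle concrete, here is a minimal sketch of the computation (a toy illustration, not the package's internal code). It assumes NumPy and the Louvain implementation shipped with networkx (version 3 or later) in place of the package's own pipeline:

```python
import networkx as nx
import numpy as np

# Toy graph: two triangles joined by a single edge.
A = np.array([[0, 1, 1, 0, 0, 0],
              [1, 0, 1, 0, 0, 0],
              [1, 1, 0, 1, 0, 0],
              [0, 0, 1, 0, 1, 1],
              [0, 0, 0, 1, 0, 1],
              [0, 0, 0, 1, 1, 0]])
G = nx.from_numpy_array(A)

# Detect communities with Louvain; convert the list of node sets to labels.
labels = np.empty(A.shape[0], dtype=int)
for c, nodes in enumerate(nx.community.louvain_communities(G, seed=0)):
    for v in nodes:
        labels[v] = c

# SINr principle (sketch): a node's vector is the proportion of its
# links going to each detected community.
membership = np.zeros((A.shape[0], labels.max() + 1))
membership[np.arange(A.shape[0]), labels] = 1
vectors = (A @ membership) / A.sum(axis=1, keepdims=True)
print(vectors)  # e.g., node 0 -> [1.0, 0.0]: all its links stay in its community
```

On this toy graph, Louvain recovers the two triangles, and each node's two-dimensional vector reads directly as its share of links towards each community.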

Design patterns were used to allow SINr to be used with graphs or text in the same manner, which makes it, as far as we know, the first package to deal by design with both of these data types. For usage in the context of digital humanities or NLP, standard preprocessing algorithms are also provided, mostly relying on spaCy. Finally, the most important aspect of SINr is that it allows interaction with the embedding space. SINr brings the option to probe and get an understanding of the resulting embedding space, as one can see in \autoref{fig:1}. Indeed, using SINr leads to sparse vectors: a node is not connected to all the communities of a graph, and similarly, a word is not related to all the topics of a corpus. As shown by @subramanian2018spine, sparsity is one of the features required to enforce interpretability.

![Using the visualization features of the package, one can see that related words have non-zero values on the same dimensions (abscissa): *mother* and *father* on dimensions 4, 7, 8, 9 and 11, for instance. The non-zero dimensions are distinct when comparing *mother* and *car*, which are unrelated.\label{fig:1}](sinr.png)
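
The kind of inspection illustrated in \autoref{fig:1} can be sketched in a few lines. The vectors below are made up for the sake of the example (they are not output of the package), but they show how sparsity makes relatedness directly readable from the active dimensions:

```python
import numpy as np

# Made-up sparse vectors over 12 community dimensions, for illustration only.
mother = np.array([0, 0, 0, 0, .3, 0, 0, .2, .1, .25, 0, .15])
father = np.array([0, 0, 0, 0, .2, 0, 0, .3, .2, .15, 0, .15])
car    = np.array([.4, 0, .3, 0, 0, 0, .3, 0, 0, 0, 0, 0])

def shared_dimensions(u, v):
    """Dimensions on which both vectors are active (non-zero)."""
    return np.flatnonzero((u != 0) & (v != 0))

print(shared_dimensions(mother, father))  # [ 4  7  8  9 11]: related words overlap
print(shared_dimensions(mother, car))     # []: unrelated words share no dimension
```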




# Performance of the SINr approach
The performance of SINr was evaluated on several tasks, including link prediction on graphs and word-pair similarity for textual data, in @prouteau2021sinr. While providing good performance, SINr runs faster than most other embedding approaches. Furthermore, the interpretability of the model was also demonstrated to be comparable to the state of the art for word embedding [@prouteau2022embedding].

# Availability

The SINr package is distributed at \url{https://github.com/SINr-Embeddings/sinr} with its [documentation](https://sinr-embeddings.github.io/sinr/) and [notebooks](https://github.com/SINr-Embeddings/sinr/tree/main/notebooks). It can also be found on [PyPI](https://pypi.org/project/sinr/) to be installed with pip.
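
For a first contact, installation and a rough workflow might look as follows. This is a hedged sketch: the method names below are assumptions made for illustration, and the documentation and notebooks linked above are authoritative.

```python
# pip install sinr
import sinr.graph_embeddings as ge  # module shipped with the package

# Hypothetical workflow (names are illustrative assumptions, see the docs):
# load a word co-occurrence graph, detect communities with Louvain,
# then extract the interpretable embedding vectors.
model = ge.SINr.load_from_cooc_pkl("cooccurrence.pk")  # assumed loader
communities = model.detect_communities()               # assumed, Louvain-based
model.extract_embeddings(communities)                  # assumed extractor
```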

# Scope and Alternatives

For graph embedding, the Karate Club library designed by @rozemberczki2020karate allows one to try and test recent approaches. However, so far, none of them is known to be as fast as SINr, nor interpretable. For statistical semantics on textual corpora, the gensim package [@vrehuuvrek2011gensim] is still a good option. For more recent approaches, the Flair [@akbik2019flair] and spaCy [@vasiliev2020natural] packages can be used, allowing complete NLP pipelines to be built. SINr embeddings can actually be used in a complementary fashion with these packages.



# Acknowledgements

This research benefited from the support of the ANR project 'Dynamic and Interpretable Graph-based word embeddINGs' (ANR-21-CE23-0010).

# References

Binary file added paper.pdf
Binary file not shown.
Binary file added sinr.png
2 changes: 1 addition & 1 deletion sinr/graph_embeddings.py
@@ -1569,4 +1569,4 @@ def light_model_save(self):
 
 class DimensionFilteredException(Exception):
     """Exception raised when trying to access a dimension removed by filtering. """
-    pass
+    pass
Binary file added sinr_working.png
