Resolve merge conflict by incorporating both suggestions
Showing 6 changed files with 217 additions and 1 deletion.
paper.bib
@inproceedings{prouteau2021sinr,
  title={{SINr}: Fast Computing of Sparse Interpretable Node Representations is not a Sin!},
  author={Prouteau, Thibault and Connes, Victor and Dugu{\'e}, Nicolas and Perez, Anthony and Lamirel, Jean-Charles and Camelin, Nathalie and Meignier, Sylvain},
  booktitle={IDA},
  pages={325--337},
  year={2021},
  doi={10.1007/978-3-030-74251-5_26}
}

@inproceedings{prouteau2022embedding,
  title={Are Embedding Spaces Interpretable? Results of an Intrusion Detection Evaluation on a Large French Corpus},
  author={Prouteau, Thibault and Dugu{\'e}, Nicolas and Camelin, Nathalie and Meignier, Sylvain},
  booktitle={LREC},
  year={2022}
}

@article{mikolov2013efficient,
  title={Efficient estimation of word representations in vector space},
  author={Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
  journal={arXiv preprint arXiv:1301.3781},
  year={2013},
  doi={10.48550/arXiv.1301.3781}
}

@inproceedings{pennington2014glove,
  title={{GloVe}: Global vectors for word representation},
  author={Pennington, Jeffrey and Socher, Richard and Manning, Christopher D},
  booktitle={EMNLP},
  pages={1532--1543},
  year={2014},
  doi={10.3115/v1/D14-1162}
}

@article{le2019flaubert,
  title={{FlauBERT}: Unsupervised language model pre-training for {F}rench},
  author={Le, Hang and Vial, Lo{\"\i}c and Frej, Jibril and Segonne, Vincent and Coavoux, Maximin and Lecouteux, Benjamin and Allauzen, Alexandre and Crabb{\'e}, Benoit and Besacier, Laurent and Schwab, Didier},
  journal={arXiv preprint arXiv:1912.05372},
  year={2019},
  doi={10.48550/arXiv.1912.05372}
}

@article{martin2019camembert,
  title={{CamemBERT}: a tasty {F}rench language model},
  author={Martin, Louis and Muller, Benjamin and Su{\'a}rez, Pedro Javier Ortiz and Dupont, Yoann and Romary, Laurent and de La Clergerie, {\'E}ric Villemonte and Seddah, Djam{\'e} and Sagot, Beno{\^\i}t},
  journal={arXiv preprint arXiv:1911.03894},
  year={2019},
  doi={10.48550/arXiv.1911.03894}
}

@inproceedings{grover2016node2vec,
  title={{node2vec}: Scalable feature learning for networks},
  author={Grover, Aditya and Leskovec, Jure},
  booktitle={SIGKDD},
  pages={855--864},
  year={2016},
  doi={10.1145/2939672.2939754}
}

@inproceedings{ou2016asymmetric,
  title={Asymmetric transitivity preserving graph embedding},
  author={Ou, Mingdong and Cui, Peng and Pei, Jian and Zhang, Ziwei and Zhu, Wenwu},
  booktitle={SIGKDD},
  pages={1105--1114},
  year={2016},
  doi={10.1145/2939672.2939751}
}

@inproceedings{cao2016deep,
  title={Deep neural networks for learning graph representations},
  author={Cao, Shaosheng and Lu, Wei and Xu, Qiongkai},
  booktitle={AAAI},
  volume={30},
  number={1},
  year={2016},
  doi={10.1609/aaai.v30i1.10179}
}

@inproceedings{subramanian2018spine,
  title={{SPINE}: Sparse interpretable neural embeddings},
  author={Subramanian, Anant and Pruthi, Danish and Jhamtani, Harsh and Berg-Kirkpatrick, Taylor and Hovy, Eduard},
  booktitle={AAAI},
  volume={32},
  number={1},
  year={2018},
  doi={10.48550/arXiv.1711.08792}
}

@article{blondel2008fast,
  title={Fast unfolding of communities in large networks},
  author={Blondel, Vincent D and Guillaume, Jean-Loup and Lambiotte, Renaud and Lefebvre, Etienne},
  journal={Journal of Statistical Mechanics: Theory and Experiment},
  volume={2008},
  number={10},
  pages={P10008},
  year={2008},
  doi={10.1088/1742-5468/2008/10/P10008}
}

@inproceedings{rozemberczki2020karate,
  title={Karate Club: an {API} oriented open-source {P}ython framework for unsupervised learning on graphs},
  author={Rozemberczki, Benedek and Kiss, Oliver and Sarkar, Rik},
  booktitle={CIKM},
  pages={3125--3132},
  year={2020},
  doi={10.1145/3340531.3412757}
}

@article{vrehuuvrek2011gensim,
  title={Gensim---statistical semantics in {P}ython},
  author={{\v{R}}eh{\r{u}}{\v{r}}ek, Radim and Sojka, Petr and others},
  journal={Retrieved from gensim.org},
  year={2011},
  publisher={Citeseer}
}

@inproceedings{akbik2019flair,
  title={{FLAIR}: An easy-to-use framework for state-of-the-art {NLP}},
  author={Akbik, Alan and Bergmann, Tanja and Blythe, Duncan and Rasul, Kashif and Schweter, Stefan and Vollgraf, Roland},
  booktitle={{NAACL} (Demonstrations)},
  pages={54--59},
  year={2019},
  doi={10.18653/v1/N19-4010}
}

@book{vasiliev2020natural,
  title={Natural language processing with {P}ython and {spaCy}: A practical introduction},
  author={Vasiliev, Yuli},
  year={2020},
  publisher={No Starch Press}
}
paper.md
---
title: 'SINr: a Python package to train interpretable word and graph embeddings'
tags:
  - Python
  - computer science
  - natural language processing
  - graphs
  - social network analysis
authors:
  - name: Thibault Prouteau
    orcid: 0000-0001-9610-3191
    equal-contrib: false
    corresponding: true
    affiliation: 1
  - name: Nicolas Dugué
    orcid: 0000-0002-6150-1939
    corresponding: true
    equal-contrib: false
    affiliation: 1
  - name: Simon Guillot
    equal-contrib: false
    affiliation: 1
  - name: Anthony Perez
    corresponding: false
    equal-contrib: false
    affiliation: 2
affiliations:
  - name: Le Mans Université, LIUM, EA 4023, Laboratoire d'Informatique de l'Université du Mans, France
    index: 1
  - name: Univ. Orléans, INSA Centre Val de Loire, LIFO EA 4022, Orléans, France
    index: 2
date: 22 February 2023
bibliography: paper.bib
---

# Summary

In this paper, we introduce SINr, a Python package to train word and graph embeddings. Word embeddings encode the meaning of words into vector representations that can be used by algorithms. The principle was then transferred to the study of graphs, mathematical structures that model real-world entities and their relations (social networks, for instance). These information-rich vectors are often the first building block of more complex artificial intelligence pipelines. With SINr, we focus on providing frugal and interpretable algorithms to build this first block. Indeed, with the growing complexity of the algorithms used in artificial intelligence (deep learning), frugality and interpretability have become major concerns. The SINr approach is based on community detection: the vector of a node is built from the distribution of its connections across the communities detected in the graph at hand. The algorithm thus runs very efficiently and does not require GPUs, which allows broad usage. Furthermore, the dimensions of the embedding space are interpretable, since they are based on the communities extracted. One can thus interact with the dimensions of the vectors and inspect the meaning of the representation.

# Statement of need and functionalities

The SINr package can be used in the fields of natural language processing (NLP), digital humanities, and network analysis. It allows words and network entities to be represented as vectors in mathematical spaces named embedding spaces.

With neural approaches, tremendous progress was made in NLP, notably in representing the vocabulary of the language at hand. These representations are used as input for machine learning algorithms in the form of dense numeric vectors named word embeddings. Examples of approaches to train such vectors on large textual corpora are Word2vec [@mikolov2013efficient], GloVe [@pennington2014glove], and, for contextualized representations, Transformer-based approaches such as CamemBERT [@martin2019camembert] or FlauBERT [@le2019flaubert] for French. This progress was transferred to the graph universe, giving rise to graph embedding, a whole field of research with Word2vec-inspired approaches such as node2vec [@grover2016node2vec], matrix factorization methods like HOPE [@ou2016asymmetric], and auto-encoding paradigms [@cao2016deep].
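
As an illustration of this classic static-embedding pipeline, a few lines of gensim are enough to train Word2vec vectors; the toy corpus and hyperparameters below are placeholders of ours, not taken from the paper:

```python
# Training classic Word2vec vectors with gensim (gensim >= 4.0 API).
from gensim.models import Word2Vec

# Toy corpus as a stand-in for a large tokenized text collection.
sentences = [
    ["the", "mother", "holds", "the", "child"],
    ["the", "father", "holds", "the", "child"],
]
model = Word2Vec(sentences=sentences, vector_size=50, window=3, min_count=1, sg=1)
print(model.wv["mother"][:5])  # dense vector: dimensions are not interpretable
```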

![Illustration of SINr: vertices are represented based on the communities they are linked to. \label{fig:2}](sinr_working.png)

SINr was introduced to take advantage of this progress: it embeds words and nodes just like the aforementioned methods. However, it is based on community detection: for each node, the embedding vector is computed as the proportion of its links going to each community, as described in \autoref{fig:2} (a minimal sketch of this computation follows the list below). This approach avoids some flaws inherent in the usual approaches:

- As far as we know, SINr is the first approach specifically designed to deal with both word and graph embeddings. Text corpora are represented as graphs, and with the adequate preprocessing provided by the package, word embeddings can easily be extracted with SINr. For graph embedding, no specific preprocessing is required.
- Contrary to neural approaches that require complex GPU computations, SINr is based on the Louvain algorithm [@blondel2008fast] to detect communities and thus runs in linear time; it can be executed on standalone laptops. Still, it can easily be interfaced with the gensim package for word embedding, or with the karateclub package for graph embedding.
- Contrary to the usual approaches, because dimensions are based on communities, the space in which words and graphs are embedded with SINr is interpretable.
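
To make this construction concrete, here is a minimal sketch of the community-membership representation described above. It uses networkx and its built-in Louvain implementation rather than the SINr package itself; the function and variable names are ours and are not part of the SINr API:

```python
# Minimal sketch of the idea behind SINr (not the package's actual API):
# detect communities with Louvain, then represent each node by the
# proportion of its edges pointing into each community.
import networkx as nx
import numpy as np

def community_membership_embedding(G, seed=42):
    # Louvain community detection (available in networkx >= 2.8).
    communities = nx.community.louvain_communities(G, seed=seed)
    node_to_com = {v: c for c, com in enumerate(communities) for v in com}
    nodes = list(G.nodes())
    vectors = np.zeros((len(nodes), len(communities)))
    for i, v in enumerate(nodes):
        for u in G.neighbors(v):
            vectors[i, node_to_com[u]] += 1.0
        vectors[i] /= max(G.degree(v), 1)  # proportion of links per community
    return nodes, vectors

nodes, vectors = community_membership_embedding(nx.karate_club_graph())
print(vectors[0])  # sparse: one interpretable dimension per community
```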

Design patterns were used so that SINr can be applied to graphs or text in the same manner, which makes it, as far as we know, the first package to handle both of these data types by design. For usage in the context of digital humanities or NLP, standard preprocessing algorithms are also provided, mostly relying on spaCy. Finally, the most important aspect of SINr is that it allows interaction with the embedding space. SINr brings the option to probe and understand the resulting embedding space, as one can see in \autoref{fig:1}. Indeed, using SINr leads to sparse vectors: a node is not connected to all the communities of a graph, and similarly, a word is not related to all the topics of a corpus. As shown by @subramanian2018spine, sparsity is one of the features required to enforce interpretability.
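
Continuing the sketch above, this sparsity is easy to verify, and two related nodes can be compared through the dimensions they share, mirroring the mother/father example of \autoref{fig:1} (again, an illustration of ours rather than the package's API):

```python
# Inspect sparsity and shared non-zero dimensions of the sketch's vectors.
sparsity = float(np.mean(vectors == 0))
print(f"fraction of zero entries: {sparsity:.2f}")

# Dimensions (communities) on which both node 0 and node 1 are active.
shared = np.nonzero((vectors[0] > 0) & (vectors[1] > 0))[0]
print("dimensions shared by nodes 0 and 1:", shared)
```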

![Using the visualization features of the package, one can see that related words have non-zero values for the same dimensions (abscissa), mother and father for dimensions 4, 7, 8, 9 and 11 for instance. The non-zero dimensions are distinct when comparing mother and car, which are unrelated.\label{fig:1}](sinr.png)

# Performance of the SINr approach

The performance of SINr was evaluated on several tasks, including link prediction on graphs and word-pair similarity for textual data, in @prouteau2021sinr. While providing good performance, it runs faster than most other embedding approaches. Furthermore, the interpretability of the model was demonstrated to be comparable to the state of the art for word embeddings [@prouteau2022embedding].

# Availability

The SINr package is distributed at \url{https://github.com/SINr-Embeddings/sinr} with its [documentation](https://sinr-embeddings.github.io/sinr/) and [notebooks](https://github.com/SINr-Embeddings/sinr/tree/main/notebooks). It is also available on [PyPI](https://pypi.org/project/sinr/) and can be installed with `pip install sinr`.

# Scope and Alternatives

For graph embedding, the Karate Club library designed by @rozemberczki2020karate allows trying and testing recent approaches. However, so far, none of them is known to be as fast as SINr or interpretable. For statistical semantics on textual corpora, the gensim package [@vrehuuvrek2011gensim] remains a good option. For more recent approaches, the Flair [@akbik2019flair] and spaCy [@vasiliev2020natural] packages can be used to build complete NLP pipelines. SINr embeddings can actually be used as a complement to these packages.
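
For reference, a typical Karate Club workflow looks like the sketch below; DeepWalk is one of the dense baselines the library ships, and the constructor argument shown is an assumption that may vary across versions:

```python
# A typical Karate Club workflow (sketch; defaults may differ by version).
import networkx as nx
from karateclub import DeepWalk

graph = nx.karate_club_graph()   # Karate Club expects nodes indexed 0..n-1
model = DeepWalk(dimensions=32)  # dense, non-interpretable embedding
model.fit(graph)
embedding = model.get_embedding()
print(embedding.shape)           # (34, 32)
```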

# Acknowledgements

This research benefited from the support of the ANR project 'Dynamic and Interpretable Graph-based word embeddINGs' (ANR-21-CE23-0010).

# References