Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add nmslib indexer #2417

Merged
merged 22 commits into from
Jul 7, 2019
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
217 changes: 217 additions & 0 deletions gensim/similarities/nmslib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
#!/usr/bin/env python
mpenkov marked this conversation as resolved.
Show resolved Hide resolved
# -*- coding: utf-8 -*-
#
# Copyright (C) 2019 Radim Rehurek <me@radimrehurek.com>
# Copyright (C) 2019 Masahiro Kazama <kazama.masa@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Intro
-----
This module integrates Nmslib with :class:`~gensim.models.word2vec.Word2Vec`,
:class:`~gensim.models.doc2vec.Doc2Vec`, :class:`~gensim.models.fasttext.FastText` and
:class:`~gensim.models.keyedvectors.KeyedVectors`.


What is Nmslib
--------------
Non-Metric Space Library (NMSLIB) is an efficient cross-platform similarity search library and a toolkit
for evaluation of similarity search methods. The core-library does not have any third-party dependencies.


How it works
------------
Searching in generic non-metric space.

More information about Nmslib: `github repository <https://github.com/nmslib/nmslib>`_.

"""
import os

from smart_open import smart_open
mpenkov marked this conversation as resolved.
Show resolved Hide resolved
try:
import cPickle as _pickle
except ImportError:
import pickle as _pickle

from gensim.models.doc2vec import Doc2Vec
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText
from gensim.models import KeyedVectors
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors
try:
import nmslib
except ImportError:
raise ImportError(
"Nmslib has not been installed, if you wish to use the nmslib indexer, please run `pip install nmslib`"
)


class NmslibIndexer(object):
"""This class allows to use `Nmslib <https://github.com/nmslib/nmslib>`_ as indexer for `most_similar` method
from :class:`~gensim.models.word2vec.Word2Vec`, :class:`~gensim.models.doc2vec.Doc2Vec`,
:class:`~gensim.models.fasttext.FastText` and :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` classes.

"""

def __init__(self, model=None, index_params=None, query_time_params=None):
    """
    Parameters
    ----------
    model : :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`, optional
        Model, that will be used as source for index.
        If None, no index is built: `index` and `labels` stay None and have to be populated afterwards,
        e.g. by calling :meth:`load`.
    index_params : dict, optional
        Index-time parameters for the Nmslib indexer.
        Defaults to `{'M': 10, 'indexThreadQty': 1, 'efConstruction': 100, 'post': 0}`.
    query_time_params : dict, optional
        Query-time parameters for the Nmslib indexer. Defaults to `{'efSearch': 100}`.

    Raises
    ------
    ValueError
        If `model` is given but is not a Word2Vec, Doc2Vec, FastText or KeyedVectors instance.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.similarities.nmslib import NmslibIndexer
        >>> from gensim.models import Word2Vec
        >>>
        >>> sentences = [['cute', 'cat', 'say', 'meow'], ['cute', 'dog', 'say', 'woof']]
        >>> model = Word2Vec(sentences, min_count=1, seed=1)
        >>>
        >>> indexer = NmslibIndexer(model)
        >>> [word for word, _ in model.most_similar("cat", topn=2, indexer=indexer)]
        ['cat', 'meow']

    """
    # Fresh dicts per instance: the defaults are never shared between indexers.
    if index_params is None:
        index_params = {'M': 10, 'indexThreadQty': 1, 'efConstruction': 100, 'post': 0}
    if query_time_params is None:
        query_time_params = {'efSearch': 100}

    self.index = None
    self.labels = None
    self.model = model
    self.index_params = index_params
    self.query_time_params = query_time_params

    # Compare against None explicitly, so that a model object that happens to be falsy is still indexed.
    if model is not None:
        # Doc2Vec is tested first, before the Word2Vec/FastText branch.
        if isinstance(self.model, Doc2Vec):
            self.build_from_doc2vec()
        elif isinstance(self.model, (Word2Vec, FastText)):
            self.build_from_word2vec()
        elif isinstance(self.model, (WordEmbeddingsKeyedVectors, KeyedVectors)):
            self.build_from_keyedvectors()
        else:
            raise ValueError("Only a Word2Vec, Doc2Vec, FastText or KeyedVectors instance can be used")
mpenkov marked this conversation as resolved.
Show resolved Hide resolved

def save(self, fname, protocol=2):
    """Save this NmslibIndexer instance to disk.

    Parameters
    ----------
    fname : str
        Path to the output file. Two files are produced: `fname` holds the Nmslib index and
        `fname`.d holds the pickled index parameters, query-time parameters and labels.
    protocol : int, optional
        Protocol for pickle.

    Notes
    -----
    This method saves **only** the index (**the model isn't preserved**).

    """
    params_path = fname + '.d'

    # The index itself is written by Nmslib; everything else is pickled into the companion file.
    self.index.saveIndex(fname)

    state = {
        'index_params': self.index_params,
        'query_time_params': self.query_time_params,
        'labels': self.labels,
    }
    with smart_open(params_path, 'wb') as fout:
        _pickle.dump(state, fout, protocol=protocol)

def load(self, fname):
    """Load a previously saved NmslibIndexer state into this instance.

    Parameters
    ----------
    fname : str
        Path that was used with :meth:`save`; both `fname` and `fname`.d must exist.

    Raises
    ------
    IOError
        If `fname` or `fname`.d doesn't exist.

    Notes
    -----
    The model is not part of the saved state, so `model` has to be assigned again after loading.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.similarities.nmslib import NmslibIndexer
        >>> from gensim.models import Word2Vec
        >>> from tempfile import mkstemp
        >>>
        >>> sentences = [['cute', 'cat', 'say', 'meow'], ['cute', 'dog', 'say', 'woof']]
        >>> model = Word2Vec(sentences, min_count=1, seed=1, iter=10)
        >>>
        >>> indexer = NmslibIndexer(model)
        >>> _, temp_fn = mkstemp()
        >>> indexer.save(temp_fn)
        >>>
        >>> new_indexer = NmslibIndexer()
        >>> new_indexer.load(temp_fn)
        >>> new_indexer.model = model

    """
    # NOTE(review): the project convention is a classmethod loader (e.g. `FastText.load(path)`).
    # This stays an instance method so that existing `indexer.load(fname)` callers keep working.
    fname_dict = fname + '.d'
    if not (os.path.exists(fname) and os.path.exists(fname_dict)):
        raise IOError(
            "Can't find index files '%s' and '%s' - Unable to restore NmslibIndexer state." % (fname, fname_dict)
        )

    # SECURITY: unpickling can execute arbitrary code - only load files from a trusted source.
    with smart_open(fname_dict, 'rb') as f:
        d = _pickle.loads(f.read())

    self.index_params = d['index_params']
    self.query_time_params = d['query_time_params']

    index = nmslib.init()
    index.loadIndex(fname)
    # Re-apply the saved query-time parameters, exactly as done when an index is built from a model;
    # otherwise the restored index would be queried with different settings than the original one.
    nmslib.setQueryTimeParams(index, self.query_time_params)

    # Only publish the index once it has been loaded and configured successfully.
    self.index = index
    self.labels = d['labels']

def build_from_word2vec(self):
    """Build an Nmslib index from the word vectors of the wrapped Word2Vec (or FastText) model."""
    model = self.model
    # presumably `init_sims` is what makes `vectors_norm` available - confirm against the model API
    model.init_sims()

    normalized_vectors = model.wv.vectors_norm
    words = model.wv.index2word
    return self._build_from_model(normalized_vectors, words)
mpenkov marked this conversation as resolved.
Show resolved Hide resolved

def build_from_doc2vec(self):
    """Build an Nmslib index from the document vectors of the wrapped Doc2Vec model."""
    doc_vectors = self.model.docvecs
    doc_vectors.init_sims()

    # One label per indexed document, in index order.
    doctags = list(map(doc_vectors.index_to_doctag, range(doc_vectors.count)))
    return self._build_from_model(doc_vectors.vectors_docs_norm, doctags)

def build_from_keyedvectors(self):
    """Build an Nmslib index from the word vectors of the wrapped KeyedVectors instance."""
    keyed_vectors = self.model
    keyed_vectors.init_sims()

    normalized_vectors = keyed_vectors.vectors_norm
    words = keyed_vectors.index2word
    return self._build_from_model(normalized_vectors, words)

def _build_from_model(self, vectors, labels):
    """Create the Nmslib index over `vectors` and remember `labels` for result lookup.

    Parameters
    ----------
    vectors : numpy.array
        Vectors to index, one row per item.
    labels : list
        Label of each indexed item; position `i` is the label of the item with Nmslib id `i`.

    """
    import logging

    # NOTE(review): relies on the defaults of `nmslib.init()` for the method and the space - confirm
    # they are HNSW and cosine similarity, which `index_params` and `most_similar` assume.
    index = nmslib.init()
    index.addDataPointBatch(vectors)

    index.createIndex(self.index_params, print_progress=True)
    nmslib.setQueryTimeParams(index, self.query_time_params)

    self.index = index
    self.labels = labels
    # Report through logging instead of printing to stdout from library code.
    logging.getLogger(__name__).info("finished building the Nmslib index")

def most_similar(self, vector, num_neighbors):
    """Find the approximate `num_neighbors` most similar items.

    Parameters
    ----------
    vector : numpy.array
        Vector for word/document.
    num_neighbors : int
        Number of most similar items

    Returns
    -------
    list of (str, float)
        List of most similar items in format [(`item`, `cosine_similarity`), ... ]

    """
    # Nmslib expects a batch of queries, so the single vector is reshaped into a one-row matrix.
    ids, distances = self.index.knnQueryBatch(vector.reshape(1, -1), k=num_neighbors)[0]

    # Nmslib's cosine space reports a distance (`1 - cosine similarity`), not a similarity,
    # so it is converted back here.
    return [(self.labels[id_], 1.0 - distance) for id_, distance in zip(ids, distances)]
154 changes: 154 additions & 0 deletions gensim/test/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,6 +696,160 @@ def testSaveLoad(self):
self.assertEqual(self.index.num_trees, self.index2.num_trees)


class TestWord2VecNmslibIndexer(unittest.TestCase):
    """Exercise NmslibIndexer built from Word2Vec, FastText and KeyedVectors models."""

    def setUp(self):
        """Skip the test when Nmslib is not installed, otherwise remember the indexer class."""
        try:
            import nmslib  # noqa:F401
        except ImportError:
            raise unittest.SkipTest("Nmslib library is not available")

        from gensim.similarities.nmslib import NmslibIndexer
        self.indexer = NmslibIndexer

    def _check_indexed_model(self, model):
        """Index `model` and run the query and save/load assertions shared by the model tests."""
        index = self.indexer(model)

        self.assertVectorIsSimilarToItself(model.wv, index)
        self.assertApproxNeighborsMatchExact(model, model.wv, index)
        self.assertIndexSaved(index)
        self.assertLoadedIndexEqual(index, model)

    def testWord2Vec(self):
        """An index built from a Word2Vec model answers queries and survives save/load."""
        w2v_model = word2vec.Word2Vec(texts, min_count=1)
        w2v_model.init_sims()
        self._check_indexed_model(w2v_model)

    def testFastText(self):
        """An index built from a FastText model answers queries and survives save/load."""
        class LeeReader(object):
            """Stream a corpus file as lower-cased, whitespace-tokenized lines."""

            def __init__(self, fn):
                self.fn = fn

            def __iter__(self):
                with smart_open(self.fn, 'r', encoding="latin_1") as infile:
                    for line in infile:
                        yield line.lower().strip().split()

        ft_model = FastText(LeeReader(datapath('lee.cor')))
        ft_model.init_sims()
        self._check_indexed_model(ft_model)

    def testNmslibIndexingOfKeyedVectors(self):
        """An index built directly from KeyedVectors answers queries."""
        vectors_path = datapath('lee_fasttext.vec')
        keyed_vectors = KeyedVectors.load_word2vec_format(vectors_path)
        index = self.indexer(keyed_vectors)

        self.assertVectorIsSimilarToItself(keyed_vectors, index)
        self.assertApproxNeighborsMatchExact(keyed_vectors, keyed_vectors, index)

    def testLoadMissingRaisesError(self):
        """Loading from a path that doesn't exist raises IOError."""
        empty_index = self.indexer()

        with self.assertRaises(IOError):
            empty_index.load(fname='test-index')

    def assertVectorIsSimilarToItself(self, wv, index):
        """Assert that the nearest neighbor of the first vector is its own word, with similarity ~1."""
        query = wv.vectors_norm[0]
        expected_word = wv.index2word[0]

        nearest = index.most_similar(query, 1)
        found_word, similarity = nearest[0]

        self.assertEqual(found_word, expected_word)
        self.assertAlmostEqual(similarity, 1.0, places=2)

    def assertApproxNeighborsMatchExact(self, model, wv, index):
        """Assert that the indexed top-5 neighbors match the exact top-5, in the same order."""
        query = wv.vectors_norm[0]
        approx_neighbors = model.wv.most_similar([query], topn=5, indexer=index)
        exact_neighbors = model.wv.most_similar(positive=[query], topn=5)

        self.assertEqual(
            [pair[0] for pair in approx_neighbors],
            [pair[0] for pair in exact_neighbors],
        )

    def assertIndexSaved(self, index):
        """Assert that saving writes both the index file and its `.d` companion."""
        fname = get_tmpfile('gensim_similarities.tst.pkl')
        index.save(fname)

        for path in (fname, fname + '.d'):
            self.assertTrue(os.path.exists(path))

    def assertLoadedIndexEqual(self, index, model):
        """Assert that a save/load round trip keeps the labels and both parameter dicts."""
        fname = get_tmpfile('gensim_similarities.tst.pkl')
        index.save(fname)

        restored = self.indexer()
        restored.load(fname)
        restored.model = model

        self.assertEqual(index.labels, restored.labels)
        self.assertEqual(index.index_params, restored.index_params)
        self.assertEqual(index.query_time_params, restored.query_time_params)


class TestDoc2VecNmslibIndexer(unittest.TestCase):
    """Exercise NmslibIndexer built from a Doc2Vec model."""

    def setUp(self):
        """Skip the test when Nmslib is not installed, otherwise train a model and index it."""
        try:
            import nmslib  # noqa:F401
        except ImportError:
            raise unittest.SkipTest("Nmslib library is not available")

        from gensim.similarities.nmslib import NmslibIndexer

        self.model = doc2vec.Doc2Vec(sentences, min_count=1)
        self.model.init_sims()
        self.index = NmslibIndexer(self.model)
        # The first normalized document vector serves as the query in the tests below.
        self.vector = self.model.docvecs.vectors_docs_norm[0]

    def testDocumentIsSimilarToItself(self):
        """The nearest neighbor of the first document vector is document 0, with similarity ~1."""
        nearest = self.index.most_similar(self.vector, 1)
        found_doc, similarity = nearest[0]

        self.assertEqual(found_doc, 0)
        self.assertAlmostEqual(similarity, 1.0, places=2)

    def testApproxNeighborsMatchExact(self):
        """The indexed top-5 neighbors match the exact top-5, in the same order."""
        docvecs = self.model.docvecs
        approx_neighbors = docvecs.most_similar([self.vector], topn=5, indexer=self.index)
        exact_neighbors = docvecs.most_similar(positive=[self.vector], topn=5)

        self.assertEqual(
            [pair[0] for pair in approx_neighbors],
            [pair[0] for pair in exact_neighbors],
        )

    def testSave(self):
        """Saving writes both the index file and its `.d` companion."""
        fname = get_tmpfile('gensim_similarities.tst.pkl')
        self.index.save(fname)

        for path in (fname, fname + '.d'):
            self.assertTrue(os.path.exists(path))

    def testLoadNotExist(self):
        """Loading from a path that doesn't exist raises IOError."""
        from gensim.similarities.nmslib import NmslibIndexer
        self.test_index = NmslibIndexer()

        with self.assertRaises(IOError):
            self.test_index.load(fname='test-index')

    def testSaveLoad(self):
        """A save/load round trip keeps the labels and both parameter dicts."""
        from gensim.similarities.nmslib import NmslibIndexer

        fname = get_tmpfile('gensim_similarities.tst.pkl')
        self.index.save(fname)

        self.index2 = NmslibIndexer()
        self.index2.load(fname)
        self.index2.model = self.model

        for attr in ('labels', 'index_params', 'query_time_params'):
            self.assertEqual(getattr(self.index, attr), getattr(self.index2, attr))


class TestUniformTermSimilarityIndex(unittest.TestCase):
def setUp(self):
self.documents = [[u"government", u"denied", u"holiday"], [u"holiday", u"slowing", u"hollingworth"]]
Expand Down
5 changes: 4 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,9 +252,12 @@ def finalize_options(self):
linux_testenv.extend([
'tensorflow <= 1.3.0',
'keras >= 2.0.4, <= 2.1.4',
'annoy',
'annoy'
])

if (3, 0) < sys.version_info < (3, 7):
linux_testenv.extend(['nmslib'])

ext_modules = [
Extension('gensim.models.word2vec_inner',
sources=['./gensim/models/word2vec_inner.c'],
Expand Down