Add KeyedVecs and use it in word2vec and doc2vec (#980)
* updated refactor

* commit missed file

* docstring added

* more refactoring

* add missing docstring

* fix docstring format

* clearer docstring

* minor typo in word2vec wmdistance

* pyemd error in keyedvecs

* relative import of keyedvecs from word2vec fails

* bug in init_sims in word2vec

* property descriptors for syn0, syn0norm, index2word, vocab - fixes bug in saving

* tests for loading older word2vec models

* backwards compatibility for loading older models

* test for syn0norm not saved to file

* syn0norm not saved to file for KeyedVectors

* tests and fix for accuracy

* minor bug in finalized vocab check

* warnings for direct syn0/syn0norm access

* fixes use of most_similar in accuracy

* changes logging level to ERROR in word2vec tests

* renames kv to wv in word2vec

* minor bugs with checking existence of syn0

* replaces syn0 and syn0norm with wv.syn0 and wv.syn0norm in tests and cython files

* adds changelog

* updates tests for loading word2vec models for different python versions

* Added separate word2vec model explicitly for python version 3.4.

* Added saved word2vec model for python 3.4 files

* Removed blank line in test_wikicorpus.py

* Increased window size in test_sg_hs_online

* PR #986 merged in wmd in keyedvectors.py

* Added deprecation warnings in Word2vec class attributes for future releases

* Merged rare/develop into keyedvecs removing conflicts.

* Merged rare/develop into keyedvecs and resolved conflicts.

* Changed numpy to np in test_word2vec.py

* Increased window size in test_cbow_hs_online

* Removed blank line in test_ldamodel, work around for Travis-CI issue #971

* Removed logging during import
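In short, the vocabulary, vectors, and lookup tables move off the Word2Vec model onto a KeyedVectors instance exposed as model.wv. A minimal sketch of the new access pattern (the toy corpus is illustrative; names follow the gensim API as of this commit):

    from gensim.models import Word2Vec

    # any iterable of token lists works as a corpus
    sentences = [["human", "interface", "computer"],
                 ["graph", "trees"],
                 ["graph", "minors", "trees"]]

    model = Word2Vec(sentences, min_count=1)

    # vocabulary and vectors now live on the KeyedVectors object at model.wv
    row = model.wv.vocab["graph"].index   # vocab entry -> row number
    vec = model.wv.syn0[row]              # raw vector for "graph"
    words = model.wv.index2word           # row number -> word

    # old-style model.syn0 / model.vocab access still works, but now goes
    # through deprecated property descriptors (see the commits above)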
anmolgulati authored and tmylk committed Nov 13, 2016
1 parent 20bec2e commit 699773a
Showing 29 changed files with 22,088 additions and 575 deletions.
16 changes: 8 additions & 8 deletions gensim/models/doc2vec.py
@@ -130,16 +130,16 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N
     """
     if word_vectors is None:
-        word_vectors = model.syn0
+        word_vectors = model.wv.syn0
     if word_locks is None:
         word_locks = model.syn0_lockf
     if doctag_vectors is None:
         doctag_vectors = model.docvecs.doctag_syn0
     if doctag_locks is None:
         doctag_locks = model.docvecs.doctag_syn0_lockf

-    word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
-                   model.vocab[w].sample_int > model.random.rand() * 2**32]
+    word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
+                   model.wv.vocab[w].sample_int > model.random.rand() * 2**32]

     for pos, word in enumerate(word_vocabs):
         reduced_window = model.random.randint(model.window)  # `b` in the original doc2vec code
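The rewritten list comprehension above is gensim's frequency-based subsampling: vocabulary building stores each word's keep probability scaled into 32-bit integer range as sample_int, so keeping a word with that probability costs one comparison against model.random.rand() * 2**32. A standalone sketch of the idea (the sample_int helper here is illustrative, not gensim code):

    import numpy as np

    def sample_int(keep_probability):
        # scale the keep probability into uint32 range
        return int(round(keep_probability * 2**32))

    rng = np.random.RandomState(1)
    threshold = sample_int(0.25)  # keep ~25% of this word's occurrences

    trials = 100000
    kept = sum(threshold > rng.rand() * 2**32 for _ in range(trials))
    print(kept / float(trials))  # ~0.25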
@@ -185,21 +185,21 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,
     """
     if word_vectors is None:
-        word_vectors = model.syn0
+        word_vectors = model.wv.syn0
     if word_locks is None:
         word_locks = model.syn0_lockf
     if doctag_vectors is None:
         doctag_vectors = model.docvecs.doctag_syn0
     if doctag_locks is None:
         doctag_locks = model.docvecs.doctag_syn0_lockf

-    word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
-                   model.vocab[w].sample_int > model.random.rand() * 2**32]
+    word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
+                   model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
     doctag_len = len(doctag_indexes)
     if doctag_len != model.dm_tag_count:
         return 0  # skip doc without expected number of doctag(s) (TODO: warn/pad?)

-    null_word = model.vocab['\0']
+    null_word = model.wv.vocab['\0']
     pre_pad_count = model.window
     post_pad_count = model.window
     padded_document_indexes = (
@@ -214,7 +214,7 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,
             + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)]  # following words
         )
         word_context_len = len(word_context_indexes)
-        predict_word = model.vocab[model.index2word[padded_document_indexes[pos]]]
+        predict_word = model.wv.vocab[model.wv.index2word[padded_document_indexes[pos]]]
         # numpy advanced-indexing copies; concatenate, flatten to 1d
         l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel()
         neu1e = train_cbow_pair(model, predict_word, None, l1, alpha,
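For context on the dm_concat hunks above: the document's word indexes are padded on both sides with the null word ('\0') so that every position has a full window of context indexes to concatenate. A rough sketch of that index construction, with illustrative values in place of the model object:

    window = 2
    doc_indexes = [10, 11, 12]  # word indexes of a 3-word document
    null_index = 0              # index of the '\0' null word (illustrative)

    padded = [null_index] * window + doc_indexes + [null_index] * window

    for pos in range(window, window + len(doc_indexes)):
        context = padded[pos - window:pos] + padded[pos + 1:pos + 1 + window]
        print(padded[pos], context)
    # 10 [0, 0, 11, 12]
    # 11 [0, 10, 12, 0]
    # 12 [10, 11, 0, 0]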
6 changes: 3 additions & 3 deletions gensim/models/doc2vec_inner.pyx
@@ -268,7 +268,7 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None,

     # default vectors, locks from syn0/doctag_syn0
     if word_vectors is None:
-        word_vectors = model.syn0
+        word_vectors = model.wv.syn0
     _word_vectors = <REAL_t *>(np.PyArray_DATA(word_vectors))
     if doctag_vectors is None:
         doctag_vectors = model.docvecs.doctag_syn0
@@ -405,7 +405,7 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N

     # default vectors, locks from syn0/doctag_syn0
     if word_vectors is None:
-        word_vectors = model.syn0
+        word_vectors = model.wv.syn0
     _word_vectors = <REAL_t *>(np.PyArray_DATA(word_vectors))
     if doctag_vectors is None:
         doctag_vectors = model.docvecs.doctag_syn0
@@ -567,7 +567,7 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,

     # default vectors, locks from syn0/doctag_syn0
     if word_vectors is None:
-        word_vectors = model.syn0
+        word_vectors = model.wv.syn0
     _word_vectors = <REAL_t *>(np.PyArray_DATA(word_vectors))
     if doctag_vectors is None:
         doctag_vectors = model.docvecs.doctag_syn0
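The "property descriptors for syn0, syn0norm, index2word, vocab" and "warnings for direct syn0/syn0norm access" commits above keep old attribute access working by forwarding to model.wv. A simplified sketch of that pattern (not the exact gensim implementation):

    import warnings

    class KeyedVectors(object):
        def __init__(self):
            self.syn0 = None
            self.syn0norm = None
            self.vocab = {}
            self.index2word = []

    class Word2Vec(object):
        def __init__(self):
            self.wv = KeyedVectors()

        @property
        def syn0(self):
            # old access path still works, but steers callers to the new home
            warnings.warn("direct access to syn0 is deprecated; use model.wv.syn0",
                          DeprecationWarning)
            return self.wv.syn0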