Add KeyedVecs and use it in word2vec and doc2vec (#980)
* updated refactor

* commit missed file

* docstring added

* more refactoring

* add missing docstring

* fix docstring format

* clearer docstring

* minor typo in word2vec wmdistance

* pyemd error in keyedvecs

* relative import of keyedvecs from word2vec fails

* bug in init_sims in word2vec

* property descriptors for syn0, syn0norm, index2word, vocab - fixes bug in saving

* tests for loading older word2vec models

* backwards compatibility for loading older models

* test for syn0norm not saved to file

* syn0norm not saved to file for KeyedVectors

* tests and fix for accuracy

* minor bug in finalized vocab check

* warnings for direct syn0/syn0norm access

* fixes use of most_similar in accuracy

* changes logging level to ERROR in word2vec tests

* renames kv to wv in word2vec

* minor bugs with checking existence of syn0

* replaces syn0 and syn0norm with wv.syn0 and wv.syn0norm in tests and cython files

* adds changelog

* updates tests for loading word2vec models for different python versions

* Added separate word2vec model explicitly for python version 3.4.

* Added saved word2vec model for python 3.4 files

* Removed blank line in test_wikicorpus.py

* Increased window size in test_sg_hs_online

* PR #986 merged in wmd in keyedvectors.py

* Added deprecation warnings in Word2vec class attributes for future releases

* Merged rare/develop into keyedvecs removing conflicts.

* Merged rare/develop into keyedvecs and resolved conflicts.

* Changed numpy to np in test_word2vec.py

* Increased window size in test_cbow_hs_online

* Removed blank line in test_ldamodel, work around for Travis-CI issue #971

* Removed logging during import
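In short, the vocabulary, vectors, and lookup tables move off the Word2Vec model onto a KeyedVectors instance exposed as model.wv. A minimal sketch of the new access pattern (the toy corpus is illustrative; names follow the gensim API as of this commit):

    from gensim.models import Word2Vec

    # any iterable of token lists works as a corpus
    sentences = [["human", "interface", "computer"],
                 ["graph", "trees"],
                 ["graph", "minors", "trees"]]

    model = Word2Vec(sentences, min_count=1)

    # vocabulary and vectors now live on the KeyedVectors object at model.wv
    row = model.wv.vocab["graph"].index   # vocab entry -> row number
    vec = model.wv.syn0[row]              # raw vector for "graph"
    words = model.wv.index2word           # row number -> word

    # old-style model.syn0 / model.vocab access still works, but now goes
    # through deprecated property descriptors (see the commits above)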
anmolgulati authored and tmylk committed Nov 13, 2016
1 parent 20bec2e commit 699773a
Showing 29 changed files with 22,088 additions and 575 deletions.
16 changes: 8 additions & 8 deletions gensim/models/doc2vec.py
@@ -130,16 +130,16 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N
     """
     if word_vectors is None:
-        word_vectors = model.syn0
+        word_vectors = model.wv.syn0
     if word_locks is None:
         word_locks = model.syn0_lockf
     if doctag_vectors is None:
         doctag_vectors = model.docvecs.doctag_syn0
     if doctag_locks is None:
         doctag_locks = model.docvecs.doctag_syn0_lockf

-    word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
-                   model.vocab[w].sample_int > model.random.rand() * 2**32]
+    word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
+                   model.wv.vocab[w].sample_int > model.random.rand() * 2**32]

     for pos, word in enumerate(word_vocabs):
         reduced_window = model.random.randint(model.window)  # `b` in the original doc2vec code
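The rewritten list comprehension above is gensim's frequency-based subsampling: vocabulary building stores each word's keep probability scaled into 32-bit integer range as sample_int, so keeping a word with that probability costs one comparison against model.random.rand() * 2**32. A standalone sketch of the idea (the sample_int helper here is illustrative, not gensim code):

    import numpy as np

    def sample_int(keep_probability):
        # scale the keep probability into uint32 range
        return int(round(keep_probability * 2**32))

    rng = np.random.RandomState(1)
    threshold = sample_int(0.25)  # keep ~25% of this word's occurrences

    trials = 100000
    kept = sum(threshold > rng.rand() * 2**32 for _ in range(trials))
    print(kept / float(trials))  # ~0.25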
@@ -185,21 +185,21 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,
     """
     if word_vectors is None:
-        word_vectors = model.syn0
+        word_vectors = model.wv.syn0
     if word_locks is None:
         word_locks = model.syn0_lockf
     if doctag_vectors is None:
         doctag_vectors = model.docvecs.doctag_syn0
     if doctag_locks is None:
         doctag_locks = model.docvecs.doctag_syn0_lockf

-    word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
-                   model.vocab[w].sample_int > model.random.rand() * 2**32]
+    word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
+                   model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
     doctag_len = len(doctag_indexes)
     if doctag_len != model.dm_tag_count:
         return 0  # skip doc without expected number of doctag(s) (TODO: warn/pad?)

-    null_word = model.vocab['\0']
+    null_word = model.wv.vocab['\0']
     pre_pad_count = model.window
     post_pad_count = model.window
     padded_document_indexes = (
@@ -214,7 +214,7 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,
             + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)]  # following words
         )
         word_context_len = len(word_context_indexes)
-        predict_word = model.vocab[model.index2word[padded_document_indexes[pos]]]
+        predict_word = model.wv.vocab[model.wv.index2word[padded_document_indexes[pos]]]
         # numpy advanced-indexing copies; concatenate, flatten to 1d
         l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel()
         neu1e = train_cbow_pair(model, predict_word, None, l1, alpha,
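For context on the dm_concat hunks above: the document's word indexes are padded on both sides with the null word ('\0') so that every position has a full window of context indexes to concatenate. A rough sketch of that index construction, with illustrative values in place of the model object:

    window = 2
    doc_indexes = [10, 11, 12]  # word indexes of a 3-word document
    null_index = 0              # index of the '\0' null word (illustrative)

    padded = [null_index] * window + doc_indexes + [null_index] * window

    for pos in range(window, window + len(doc_indexes)):
        context = padded[pos - window:pos] + padded[pos + 1:pos + 1 + window]
        print(padded[pos], context)
    # 10 [0, 0, 11, 12]
    # 11 [0, 10, 12, 0]
    # 12 [10, 11, 0, 0]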
6 changes: 3 additions & 3 deletions gensim/models/doc2vec_inner.pyx
@@ -268,7 +268,7 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None,

     # default vectors, locks from syn0/doctag_syn0
     if word_vectors is None:
-        word_vectors = model.syn0
+        word_vectors = model.wv.syn0
     _word_vectors = <REAL_t *>(np.PyArray_DATA(word_vectors))
     if doctag_vectors is None:
         doctag_vectors = model.docvecs.doctag_syn0
@@ -405,7 +405,7 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N

     # default vectors, locks from syn0/doctag_syn0
     if word_vectors is None:
-        word_vectors = model.syn0
+        word_vectors = model.wv.syn0
     _word_vectors = <REAL_t *>(np.PyArray_DATA(word_vectors))
     if doctag_vectors is None:
         doctag_vectors = model.docvecs.doctag_syn0
@@ -567,7 +567,7 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,

     # default vectors, locks from syn0/doctag_syn0
     if word_vectors is None:
-        word_vectors = model.syn0
+        word_vectors = model.wv.syn0
     _word_vectors = <REAL_t *>(np.PyArray_DATA(word_vectors))
     if doctag_vectors is None:
         doctag_vectors = model.docvecs.doctag_syn0
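The "property descriptors for syn0, syn0norm, index2word, vocab" and "warnings for direct syn0/syn0norm access" commits above keep old attribute access working by forwarding to model.wv. A simplified sketch of that pattern (not the exact gensim implementation):

    import warnings

    class KeyedVectors(object):
        def __init__(self):
            self.syn0 = None
            self.syn0norm = None
            self.vocab = {}
            self.index2word = []

    class Word2Vec(object):
        def __init__(self):
            self.wv = KeyedVectors()

        @property
        def syn0(self):
            # old access path still works, but steers callers to the new home
            warnings.warn("direct access to syn0 is deprecated; use model.wv.syn0",
                          DeprecationWarning)
            return self.wv.syn0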