Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes issues while loading word2vec and doc2vec models saved using old Gensim versions. Fixes #2000, #1977, #2012

Merged
merged 13 commits into from
Apr 12, 2018
Merged
35 changes: 23 additions & 12 deletions gensim/models/deprecated/doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@

from gensim import utils
from gensim.utils import call_on_class_only, deprecated
from gensim.models.deprecated.word2vec import Word2Vec, train_cbow_pair, train_sg_pair, train_batch_sg
from gensim.models.deprecated.word2vec import Word2Vec, train_cbow_pair, train_sg_pair, train_batch_sg,\
MAX_WORDS_IN_BATCH
from gensim.models.deprecated.keyedvectors import KeyedVectors
from gensim.models.doc2vec import Doc2Vec as NewDoc2Vec
from gensim.models.deprecated.old_saveload import SaveLoad
Expand Down Expand Up @@ -104,8 +105,8 @@ def load_old_doc2vec(*args, **kwargs):
'cbow_mean': old_model.cbow_mean,
'hashfxn': old_model.hashfxn,
'iter': old_model.iter,
'sorted_vocab': old_model.sorted_vocab,
'batch_words': old_model.batch_words,
'sorted_vocab': old_model.__dict__.get('sorted_vocab', 1),
'batch_words': old_model.__dict__.get('batch_words', MAX_WORDS_IN_BATCH),
'compute_loss': old_model.__dict__.get('compute_loss', None)
}
new_model = NewDoc2Vec(**params)
Expand Down Expand Up @@ -136,16 +137,26 @@ def load_old_doc2vec(*args, **kwargs):

# set doc2vec vocabulary attributes
new_model.docvecs.doctags = old_model.docvecs.doctags
new_model.docvecs.max_rawint = old_model.docvecs.max_rawint
new_model.docvecs.offset2doctag = old_model.docvecs.offset2doctag
new_model.docvecs.count = old_model.docvecs.count

new_model.train_count = old_model.train_count
new_model.corpus_count = old_model.corpus_count
new_model.running_training_loss = old_model.running_training_loss
new_model.total_train_time = old_model.total_train_time
new_model.min_alpha_yet_reached = old_model.min_alpha_yet_reached
new_model.model_trimmed_post_training = old_model.model_trimmed_post_training
if hasattr(old_model.docvecs, 'max_rawint'): # `doc2vec` models before `0.12.3` do not have these 2 attributes
new_model.docvecs.max_rawint = old_model.docvecs.__dict__.get('max_rawint')
new_model.docvecs.offset2doctag = old_model.docvecs.__dict__.get('offset2doctag')
else:
# Doc2Vec models before Gensim version 0.12.3 did not have `max_rawint` and `offset2doctag`, as they did not
# support mixing of string and int tags. This implies the new attribute `offset2doctag` equals the old `index2doctag`
# (which was only filled if the documents had string tags).
# This also implies that the new attribute `max_rawint` (the highest rawint-indexed doctag) would either be equal
# to the initial value -1, in case only string tags are used, or would be equal to `count - 1` if only int indexing
# was used.
new_model.docvecs.max_rawint = -1 if old_model.docvecs.index2doctag else old_model.docvecs.count - 1
new_model.docvecs.offset2doctag = old_model.docvecs.index2doctag

new_model.train_count = old_model.__dict__.get('train_count', None)
new_model.corpus_count = old_model.__dict__.get('corpus_count', None)
new_model.running_training_loss = old_model.__dict__.get('running_training_loss', 0)
new_model.total_train_time = old_model.__dict__.get('total_train_time', None)
new_model.min_alpha_yet_reached = old_model.__dict__.get('min_alpha_yet_reached', old_model.alpha)
new_model.model_trimmed_post_training = old_model.__dict__.get('model_trimmed_post_training', None)

return new_model

Expand Down
27 changes: 14 additions & 13 deletions gensim/models/deprecated/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,25 +151,26 @@

def load_old_word2vec(*args, **kwargs):
old_model = Word2Vec.load(*args, **kwargs)
vector_size = getattr(old_model, 'vector_size', old_model.layer1_size)
params = {
'size': old_model.vector_size,
'size': vector_size,
'alpha': old_model.alpha,
'window': old_model.window,
'min_count': old_model.min_count,
'max_vocab_size': old_model.__dict__.get('max_vocab_size', None),
'sample': old_model.sample,
'sample': old_model.__dict__.get('sample', 1e-3),
'seed': old_model.seed,
'workers': old_model.workers,
'min_alpha': old_model.min_alpha,
'sg': old_model.sg,
'hs': old_model.hs,
'negative': old_model.negative,
'cbow_mean': old_model.cbow_mean,
'hashfxn': old_model.hashfxn,
'iter': old_model.iter,
'null_word': old_model.null_word,
'sorted_vocab': old_model.sorted_vocab,
'batch_words': old_model.batch_words,
'hashfxn': old_model.__dict__.get('hashfxn', hash),
'iter': old_model.__dict__.get('iter', 5),
'null_word': old_model.__dict__.get('null_word', 0),
'sorted_vocab': old_model.__dict__.get('sorted_vocab', 1),
'batch_words': old_model.__dict__.get('batch_words', MAX_WORDS_IN_BATCH),
'compute_loss': old_model.__dict__.get('compute_loss', None)
}
new_model = NewWord2Vec(**params)
Expand All @@ -186,13 +187,13 @@ def load_old_word2vec(*args, **kwargs):
# set vocabulary attributes
new_model.wv.vocab = old_model.wv.vocab
new_model.wv.index2word = old_model.wv.index2word
new_model.vocabulary.cum_table = old_model.cum_table
new_model.vocabulary.cum_table = old_model.__dict__.get('cum_table', None)

new_model.train_count = old_model.train_count
new_model.corpus_count = old_model.corpus_count
new_model.running_training_loss = old_model.__dict__.get('running_training_loss', None)
new_model.total_train_time = old_model.total_train_time
new_model.min_alpha_yet_reached = old_model.min_alpha_yet_reached
new_model.train_count = old_model.__dict__.get('train_count', None)
new_model.corpus_count = old_model.__dict__.get('corpus_count', None)
new_model.running_training_loss = old_model.__dict__.get('running_training_loss', 0)
new_model.total_train_time = old_model.__dict__.get('total_train_time', None)
new_model.min_alpha_yet_reached = old_model.__dict__.get('min_alpha_yet_reached', old_model.alpha)
new_model.model_trimmed_post_training = old_model.__dict__.get('model_trimmed_post_training', None)

return new_model
Expand Down
Binary file added gensim/test/test_data/d2v-lee-v0.13.0
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added gensim/test/test_data/w2v-lee-v0.12.0
Binary file not shown.
32 changes: 32 additions & 0 deletions gensim/test/test_doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,38 @@ def testLoadOldModel(self):

self.model_sanity(model)

# load really old model
model_file = 'd2v-lee-v0.13.0'
model = doc2vec.Doc2Vec.load(datapath(model_file))
self.model_sanity(model)

# Test loading doc2vec models from all previous versions
old_versions = [
'0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4',
'0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4',
'1.0.0', '1.0.1', '2.0.0', '2.1.0', '2.2.0', '2.3.0',
'3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0'
]

saved_models_dir = datapath('old_d2v_models/d2v_{}.mdl')
for old_version in old_versions:
model = doc2vec.Doc2Vec.load(saved_models_dir.format(old_version))
self.assertTrue(len(model.wv.vocab) == 3)
self.assertTrue(model.wv.vectors.shape == (3, 4))
self.assertTrue(model.docvecs.vectors_docs.shape == (2, 4))
self.assertTrue(model.docvecs.count == 2)
# check if inferring vectors for new documents and similarity search works.
doc0_inferred = model.infer_vector(list(DocsLeeCorpus())[0].words)
sims_to_infer = model.docvecs.most_similar([doc0_inferred], topn=len(model.docvecs))
self.assertTrue(sims_to_infer)
# check if inferring vectors and similarity search works after saving and loading back the model
tmpf = get_tmpfile('gensim_doc2vec.tst')
model.save(tmpf)
loaded_model = doc2vec.Doc2Vec.load(tmpf)
doc0_inferred = loaded_model.infer_vector(list(DocsLeeCorpus())[0].words)
sims_to_infer = loaded_model.docvecs.most_similar([doc0_inferred], topn=len(loaded_model.docvecs))
self.assertTrue(sims_to_infer)

def test_unicode_in_doctag(self):
"""Test storing document vectors of a model with unicode titles."""
model = doc2vec.Doc2Vec(DocsLeeCorpus(unicode_tags=True), min_count=1)
Expand Down
37 changes: 35 additions & 2 deletions gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,15 +192,15 @@ def testOnlineLearningAfterSave(self):
model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
self.assertEqual(len(model_neg.wv.vocab), 14)

def onlineSanity(self, model):
def onlineSanity(self, model, trained_model=False):
terro, others = [], []
for l in list_corpus:
if 'terrorism' in l:
terro.append(l)
else:
others.append(l)
self.assertTrue(all(['terrorism' not in l for l in others]))
model.build_vocab(others)
model.build_vocab(others, update=trained_model)
model.train(others, total_examples=model.corpus_count, epochs=model.iter)
self.assertFalse('terrorism' in model.wv.vocab)
model.build_vocab(terro, update=True)
Expand Down Expand Up @@ -764,6 +764,8 @@ def testLoadOldModel(self):
self.assertTrue(model.trainables.vectors_lockf.shape == (12,))
self.assertTrue(model.vocabulary.cum_table.shape == (12,))

self.onlineSanity(model, trained_model=True)

# Model stored in multiple files
model_file = 'word2vec_old_sep'
model = word2vec.Word2Vec.load(datapath(model_file))
Expand All @@ -774,12 +776,43 @@ def testLoadOldModel(self):
self.assertTrue(model.trainables.vectors_lockf.shape == (12,))
self.assertTrue(model.vocabulary.cum_table.shape == (12,))

self.onlineSanity(model, trained_model=True)

# load really old model
model_file = 'w2v-lee-v0.12.0'
model = word2vec.Word2Vec.load(datapath(model_file))
self.onlineSanity(model, trained_model=True)

# test for max_final_vocab for model saved in 3.3
model_file = 'word2vec_3.3'
model = word2vec.Word2Vec.load(datapath(model_file))
self.assertEqual(model.max_final_vocab, None)
self.assertEqual(model.vocabulary.max_final_vocab, None)

# Test loading word2vec models from all previous versions
old_versions = [
'0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4',
'0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4',
'1.0.0', '1.0.1', '2.0.0', '2.1.0', '2.2.0', '2.3.0',
'3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0'
]

saved_models_dir = datapath('old_w2v_models/w2v_{}.mdl')
for old_version in old_versions:
model = word2vec.Word2Vec.load(saved_models_dir.format(old_version))
self.assertTrue(len(model.wv.vocab) == 3)
self.assertTrue(model.wv.vectors.shape == (3, 4))
# check if similarity search and online training works.
self.assertTrue(len(model.wv.most_similar('sentence')) == 2)
model.build_vocab(list_corpus, update=True)
model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)
# check if similarity search and online training works after saving and loading back the model.
tmpf = get_tmpfile('gensim_word2vec.tst')
model.save(tmpf)
loaded_model = word2vec.Word2Vec.load(tmpf)
loaded_model.build_vocab(list_corpus, update=True)
loaded_model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)

@log_capture()
def testBuildVocabWarning(self, l):
"""Test if warning is raised on non-ideal input to a word2vec model"""
Expand Down