Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes issues while loading word2vec and doc2vec models saved using old Gensim versions. Fix #2000, #1977, #2012

Merged
merged 13 commits into from
Apr 12, 2018
Merged
30 changes: 18 additions & 12 deletions gensim/models/deprecated/doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@

from gensim import utils
from gensim.utils import call_on_class_only, deprecated
from gensim.models.deprecated.word2vec import Word2Vec, train_cbow_pair, train_sg_pair, train_batch_sg
from gensim.models.deprecated.word2vec import Word2Vec, train_cbow_pair, train_sg_pair, train_batch_sg,\
MAX_WORDS_IN_BATCH
from gensim.models.deprecated.keyedvectors import KeyedVectors
from gensim.models.doc2vec import Doc2Vec as NewDoc2Vec
from gensim.models.deprecated.old_saveload import SaveLoad
Expand Down Expand Up @@ -104,8 +105,8 @@ def load_old_doc2vec(*args, **kwargs):
'cbow_mean': old_model.cbow_mean,
'hashfxn': old_model.hashfxn,
'iter': old_model.iter,
'sorted_vocab': old_model.sorted_vocab,
'batch_words': old_model.batch_words,
'sorted_vocab': old_model.__dict__.get('sorted_vocab', 1),
'batch_words': old_model.__dict__.get('batch_words', MAX_WORDS_IN_BATCH),
'compute_loss': old_model.__dict__.get('compute_loss', None)
}
new_model = NewDoc2Vec(**params)
Expand Down Expand Up @@ -136,16 +137,21 @@ def load_old_doc2vec(*args, **kwargs):

# set doc2vec vocabulary attributes
new_model.docvecs.doctags = old_model.docvecs.doctags
new_model.docvecs.max_rawint = old_model.docvecs.max_rawint
new_model.docvecs.offset2doctag = old_model.docvecs.offset2doctag
new_model.docvecs.count = old_model.docvecs.count

new_model.train_count = old_model.train_count
new_model.corpus_count = old_model.corpus_count
new_model.running_training_loss = old_model.running_training_loss
new_model.total_train_time = old_model.total_train_time
new_model.min_alpha_yet_reached = old_model.min_alpha_yet_reached
new_model.model_trimmed_post_training = old_model.model_trimmed_post_training
if hasattr(old_model.docvecs, 'max_rawint'): # `doc2vec` models before `0.12.3` do not have these 2 attributes
new_model.docvecs.max_rawint = old_model.docvecs.__dict__.get('max_rawint')
new_model.docvecs.offset2doctag = old_model.docvecs.__dict__.get('offset2doctag')
else:
new_model.docvecs.max_rawint = \
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Magic: definitely deserves a comment.

len(old_model.docvecs.index2doctag) - 1 if old_model.docvecs.index2doctag else old_model.docvecs.count - 1
new_model.docvecs.offset2doctag = old_model.docvecs.index2doctag

new_model.train_count = old_model.__dict__.get('train_count', None)
new_model.corpus_count = old_model.__dict__.get('corpus_count', None)
new_model.running_training_loss = old_model.__dict__.get('running_training_loss', 0)
new_model.total_train_time = old_model.__dict__.get('total_train_time', None)
new_model.min_alpha_yet_reached = old_model.__dict__.get('min_alpha_yet_reached', old_model.alpha)
new_model.model_trimmed_post_training = old_model.__dict__.get('model_trimmed_post_training', None)

return new_model

Expand Down
27 changes: 14 additions & 13 deletions gensim/models/deprecated/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,25 +151,26 @@

def load_old_word2vec(*args, **kwargs):
old_model = Word2Vec.load(*args, **kwargs)
vector_size = getattr(old_model, 'vector_size', old_model.layer1_size)
params = {
'size': old_model.vector_size,
'size': vector_size,
'alpha': old_model.alpha,
'window': old_model.window,
'min_count': old_model.min_count,
'max_vocab_size': old_model.__dict__.get('max_vocab_size', None),
'sample': old_model.sample,
'sample': old_model.__dict__.get('sample', 1e-3),
'seed': old_model.seed,
'workers': old_model.workers,
'min_alpha': old_model.min_alpha,
'sg': old_model.sg,
'hs': old_model.hs,
'negative': old_model.negative,
'cbow_mean': old_model.cbow_mean,
'hashfxn': old_model.hashfxn,
'iter': old_model.iter,
'null_word': old_model.null_word,
'sorted_vocab': old_model.sorted_vocab,
'batch_words': old_model.batch_words,
'hashfxn': old_model.__dict__.get('hashfxn', hash),
'iter': old_model.__dict__.get('iter', 5),
'null_word': old_model.__dict__.get('null_word', 0),
'sorted_vocab': old_model.__dict__.get('sorted_vocab', 1),
'batch_words': old_model.__dict__.get('batch_words', MAX_WORDS_IN_BATCH),
'compute_loss': old_model.__dict__.get('compute_loss', None)
}
new_model = NewWord2Vec(**params)
Expand All @@ -186,13 +187,13 @@ def load_old_word2vec(*args, **kwargs):
# set vocabulary attributes
new_model.wv.vocab = old_model.wv.vocab
new_model.wv.index2word = old_model.wv.index2word
new_model.vocabulary.cum_table = old_model.cum_table
new_model.vocabulary.cum_table = old_model.__dict__.get('cum_table', None)

new_model.train_count = old_model.train_count
new_model.corpus_count = old_model.corpus_count
new_model.running_training_loss = old_model.__dict__.get('running_training_loss', None)
new_model.total_train_time = old_model.total_train_time
new_model.min_alpha_yet_reached = old_model.min_alpha_yet_reached
new_model.train_count = old_model.__dict__.get('train_count', None)
new_model.corpus_count = old_model.__dict__.get('corpus_count', None)
new_model.running_training_loss = old_model.__dict__.get('running_training_loss', 0)
new_model.total_train_time = old_model.__dict__.get('total_train_time', None)
new_model.min_alpha_yet_reached = old_model.__dict__.get('min_alpha_yet_reached', old_model.alpha)
new_model.model_trimmed_post_training = old_model.__dict__.get('model_trimmed_post_training', None)

return new_model
Expand Down
Binary file added gensim/test/test_data/d2v-lee-v0.13.0
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added gensim/test/test_data/w2v-lee-v0.12.0
Binary file not shown.
25 changes: 25 additions & 0 deletions gensim/test/test_doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,31 @@ def testLoadOldModel(self):

self.model_sanity(model)

# load really old model
model_file = 'd2v-lee-v0.13.0'
model = doc2vec.Doc2Vec.load(datapath(model_file))
self.model_sanity(model)

# Test loading doc2vec models from all previous versions
old_versions = [
'0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4',
'0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4',
'1.0.0', '1.0.1', '2.0.0', '2.1.0', '2.2.0', '2.3.0',
'3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0'
]

saved_models_dir = datapath('old_d2v_models')
Copy link
Contributor

@menshikh-iv menshikh-iv Apr 9, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better: use datapath('old_d2v_models/d2v_{}.mdl') and format it later.

for old_version in old_versions:
model = doc2vec.Doc2Vec.load(os.path.join(saved_models_dir, 'd2v_{}.mdl'.format(old_version)))
self.assertTrue(len(model.wv.vocab) == 3)
self.assertTrue(model.wv.vectors.shape == (3, 4))
self.assertTrue(model.docvecs.vectors_docs.shape == (2, 4))
self.assertTrue(model.docvecs.count == 2)
# check if inferring vectors for new documents and similarity search works.
doc0_inferred = model.infer_vector(list(DocsLeeCorpus())[0].words)
sims_to_infer = model.docvecs.most_similar([doc0_inferred], topn=len(model.docvecs))
self.assertTrue(sims_to_infer)

Copy link
Contributor

@menshikh-iv menshikh-iv Apr 9, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add save + load + infer_vector here (to be 100% sure that the model persists correctly)? Make sure that you use the /tmp directory; check gensim.test.utils, where you'll find the needed functions (and do the same for w2v).

Also, please try to update the model (as for w2v).

def test_unicode_in_doctag(self):
"""Test storing document vectors of a model with unicode titles."""
model = doc2vec.Doc2Vec(DocsLeeCorpus(unicode_tags=True), min_count=1)
Expand Down
31 changes: 29 additions & 2 deletions gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,15 +192,15 @@ def testOnlineLearningAfterSave(self):
model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
self.assertEqual(len(model_neg.wv.vocab), 14)

def onlineSanity(self, model):
def onlineSanity(self, model, trained_model=False):
terro, others = [], []
for l in list_corpus:
if 'terrorism' in l:
terro.append(l)
else:
others.append(l)
self.assertTrue(all(['terrorism' not in l for l in others]))
model.build_vocab(others)
model.build_vocab(others, update=trained_model)
model.train(others, total_examples=model.corpus_count, epochs=model.iter)
self.assertFalse('terrorism' in model.wv.vocab)
model.build_vocab(terro, update=True)
Expand Down Expand Up @@ -764,6 +764,8 @@ def testLoadOldModel(self):
self.assertTrue(model.trainables.vectors_lockf.shape == (12,))
self.assertTrue(model.vocabulary.cum_table.shape == (12,))

self.onlineSanity(model, trained_model=True)

# Model stored in multiple files
model_file = 'word2vec_old_sep'
model = word2vec.Word2Vec.load(datapath(model_file))
Expand All @@ -774,12 +776,37 @@ def testLoadOldModel(self):
self.assertTrue(model.trainables.vectors_lockf.shape == (12,))
self.assertTrue(model.vocabulary.cum_table.shape == (12,))

self.onlineSanity(model, trained_model=True)

# load really old model
model_file = 'w2v-lee-v0.12.0'
model = word2vec.Word2Vec.load(datapath(model_file))
self.onlineSanity(model, trained_model=True)

# test for max_final_vocab for model saved in 3.3
model_file = 'word2vec_3.3'
model = word2vec.Word2Vec.load(datapath(model_file))
self.assertEqual(model.max_final_vocab, None)
self.assertEqual(model.vocabulary.max_final_vocab, None)

# Test loading word2vec models from all previous versions
old_versions = [
'0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4',
'0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4',
'1.0.0', '1.0.1', '2.0.0', '2.1.0', '2.2.0', '2.3.0',
'3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0'
]

saved_models_dir = datapath('old_w2v_models')
for old_version in old_versions:
model = word2vec.Word2Vec.load(os.path.join(saved_models_dir, 'w2v_{}.mdl'.format(old_version)))
self.assertTrue(len(model.wv.vocab) == 3)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add most_similar + update a model (similarly for d2v).

self.assertTrue(model.wv.vectors.shape == (3, 4))
# check if similarity search and online training works.
self.assertTrue(len(model.wv.most_similar('sentence')) == 2)
model.build_vocab(list_corpus, update=True)
model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)

@log_capture()
def testBuildVocabWarning(self, l):
"""Test if warning is raised on non-ideal input to a word2vec model"""
Expand Down