Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes issues while loading word2vec and doc2vec models saved using old Gensim versions. Fixes #2000, #1977, #2012

Merged
merged 13 commits into from
Apr 12, 2018
Merged
35 changes: 23 additions & 12 deletions gensim/models/deprecated/doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@

from gensim import utils
from gensim.utils import call_on_class_only, deprecated
from gensim.models.deprecated.word2vec import Word2Vec, train_cbow_pair, train_sg_pair, train_batch_sg
from gensim.models.deprecated.word2vec import Word2Vec, train_cbow_pair, train_sg_pair, train_batch_sg,\
MAX_WORDS_IN_BATCH
from gensim.models.deprecated.keyedvectors import KeyedVectors
from gensim.models.doc2vec import Doc2Vec as NewDoc2Vec
from gensim.models.deprecated.old_saveload import SaveLoad
Expand Down Expand Up @@ -104,8 +105,8 @@ def load_old_doc2vec(*args, **kwargs):
'cbow_mean': old_model.cbow_mean,
'hashfxn': old_model.hashfxn,
'iter': old_model.iter,
'sorted_vocab': old_model.sorted_vocab,
'batch_words': old_model.batch_words,
'sorted_vocab': old_model.__dict__.get('sorted_vocab', 1),
'batch_words': old_model.__dict__.get('batch_words', MAX_WORDS_IN_BATCH),
'compute_loss': old_model.__dict__.get('compute_loss', None)
}
new_model = NewDoc2Vec(**params)
Expand Down Expand Up @@ -136,16 +137,26 @@ def load_old_doc2vec(*args, **kwargs):

# set doc2vec vocabulary attributes
new_model.docvecs.doctags = old_model.docvecs.doctags
new_model.docvecs.max_rawint = old_model.docvecs.max_rawint
new_model.docvecs.offset2doctag = old_model.docvecs.offset2doctag
new_model.docvecs.count = old_model.docvecs.count

new_model.train_count = old_model.train_count
new_model.corpus_count = old_model.corpus_count
new_model.running_training_loss = old_model.running_training_loss
new_model.total_train_time = old_model.total_train_time
new_model.min_alpha_yet_reached = old_model.min_alpha_yet_reached
new_model.model_trimmed_post_training = old_model.model_trimmed_post_training
if hasattr(old_model.docvecs, 'max_rawint'): # `doc2vec` models before `0.12.3` do not have these 2 attributes
new_model.docvecs.max_rawint = old_model.docvecs.__dict__.get('max_rawint')
new_model.docvecs.offset2doctag = old_model.docvecs.__dict__.get('offset2doctag')
else:
# Doc2Vec models before Gensim version 0.12.3 did not have `max_rawint` and `offset2doctag`, as they did not
# support mixing of string and int tags. This implies the new attribute `offset2doctag` equals the old `index2doctag`
# (which was only filled if the documents had string tags).
# This also implies that the new attribute `max_rawint` (the highest rawint-indexed doctag) would either be equal
# to the initial value -1, in case only string tags are used, or would be equal to `count - 1` if only int indexing
# was used.
new_model.docvecs.max_rawint = -1 if old_model.docvecs.index2doctag else old_model.docvecs.count - 1
new_model.docvecs.offset2doctag = old_model.docvecs.index2doctag

new_model.train_count = old_model.__dict__.get('train_count', None)
new_model.corpus_count = old_model.__dict__.get('corpus_count', None)
new_model.running_training_loss = old_model.__dict__.get('running_training_loss', 0)
new_model.total_train_time = old_model.__dict__.get('total_train_time', None)
new_model.min_alpha_yet_reached = old_model.__dict__.get('min_alpha_yet_reached', old_model.alpha)
new_model.model_trimmed_post_training = old_model.__dict__.get('model_trimmed_post_training', None)

return new_model

Expand Down
27 changes: 14 additions & 13 deletions gensim/models/deprecated/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,25 +151,26 @@

def load_old_word2vec(*args, **kwargs):
old_model = Word2Vec.load(*args, **kwargs)
vector_size = getattr(old_model, 'vector_size', old_model.layer1_size)
params = {
'size': old_model.vector_size,
'size': vector_size,
'alpha': old_model.alpha,
'window': old_model.window,
'min_count': old_model.min_count,
'max_vocab_size': old_model.__dict__.get('max_vocab_size', None),
'sample': old_model.sample,
'sample': old_model.__dict__.get('sample', 1e-3),
'seed': old_model.seed,
'workers': old_model.workers,
'min_alpha': old_model.min_alpha,
'sg': old_model.sg,
'hs': old_model.hs,
'negative': old_model.negative,
'cbow_mean': old_model.cbow_mean,
'hashfxn': old_model.hashfxn,
'iter': old_model.iter,
'null_word': old_model.null_word,
'sorted_vocab': old_model.sorted_vocab,
'batch_words': old_model.batch_words,
'hashfxn': old_model.__dict__.get('hashfxn', hash),
'iter': old_model.__dict__.get('iter', 5),
'null_word': old_model.__dict__.get('null_word', 0),
'sorted_vocab': old_model.__dict__.get('sorted_vocab', 1),
'batch_words': old_model.__dict__.get('batch_words', MAX_WORDS_IN_BATCH),
'compute_loss': old_model.__dict__.get('compute_loss', None)
}
new_model = NewWord2Vec(**params)
Expand All @@ -186,13 +187,13 @@ def load_old_word2vec(*args, **kwargs):
# set vocabulary attributes
new_model.wv.vocab = old_model.wv.vocab
new_model.wv.index2word = old_model.wv.index2word
new_model.vocabulary.cum_table = old_model.cum_table
new_model.vocabulary.cum_table = old_model.__dict__.get('cum_table', None)

new_model.train_count = old_model.train_count
new_model.corpus_count = old_model.corpus_count
new_model.running_training_loss = old_model.__dict__.get('running_training_loss', None)
new_model.total_train_time = old_model.total_train_time
new_model.min_alpha_yet_reached = old_model.min_alpha_yet_reached
new_model.train_count = old_model.__dict__.get('train_count', None)
new_model.corpus_count = old_model.__dict__.get('corpus_count', None)
new_model.running_training_loss = old_model.__dict__.get('running_training_loss', 0)
new_model.total_train_time = old_model.__dict__.get('total_train_time', None)
new_model.min_alpha_yet_reached = old_model.__dict__.get('min_alpha_yet_reached', old_model.alpha)
new_model.model_trimmed_post_training = old_model.__dict__.get('model_trimmed_post_training', None)

return new_model
Expand Down
Binary file added gensim/test/test_data/d2v-lee-v0.13.0
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added gensim/test/test_data/w2v-lee-v0.12.0
Binary file not shown.
32 changes: 32 additions & 0 deletions gensim/test/test_doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,38 @@ def testLoadOldModel(self):

self.model_sanity(model)

# load really old model
model_file = 'd2v-lee-v0.13.0'
model = doc2vec.Doc2Vec.load(datapath(model_file))
self.model_sanity(model)

# Test loading doc2vec models from all previous versions
old_versions = [
'0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4',
'0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4',
'1.0.0', '1.0.1', '2.0.0', '2.1.0', '2.2.0', '2.3.0',
'3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0'
]

saved_models_dir = datapath('old_d2v_models/d2v_{}.mdl')
for old_version in old_versions:
model = doc2vec.Doc2Vec.load(saved_models_dir.format(old_version))
self.assertTrue(len(model.wv.vocab) == 3)
self.assertTrue(model.wv.vectors.shape == (3, 4))
self.assertTrue(model.docvecs.vectors_docs.shape == (2, 4))
self.assertTrue(model.docvecs.count == 2)
# check if inferring vectors for new documents and similarity search works.
doc0_inferred = model.infer_vector(list(DocsLeeCorpus())[0].words)
sims_to_infer = model.docvecs.most_similar([doc0_inferred], topn=len(model.docvecs))
self.assertTrue(sims_to_infer)
# check if inferring vectors and similarity search works after saving and loading back the model
tmpf = get_tmpfile('gensim_doc2vec.tst')
model.save(tmpf)
loaded_model = doc2vec.Doc2Vec.load(tmpf)
doc0_inferred = loaded_model.infer_vector(list(DocsLeeCorpus())[0].words)
sims_to_infer = loaded_model.docvecs.most_similar([doc0_inferred], topn=len(loaded_model.docvecs))
self.assertTrue(sims_to_infer)

def test_unicode_in_doctag(self):
"""Test storing document vectors of a model with unicode titles."""
model = doc2vec.Doc2Vec(DocsLeeCorpus(unicode_tags=True), min_count=1)
Expand Down
37 changes: 35 additions & 2 deletions gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,15 +192,15 @@ def testOnlineLearningAfterSave(self):
model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
self.assertEqual(len(model_neg.wv.vocab), 14)

def onlineSanity(self, model):
def onlineSanity(self, model, trained_model=False):
terro, others = [], []
for l in list_corpus:
if 'terrorism' in l:
terro.append(l)
else:
others.append(l)
self.assertTrue(all(['terrorism' not in l for l in others]))
model.build_vocab(others)
model.build_vocab(others, update=trained_model)
model.train(others, total_examples=model.corpus_count, epochs=model.iter)
self.assertFalse('terrorism' in model.wv.vocab)
model.build_vocab(terro, update=True)
Expand Down Expand Up @@ -764,6 +764,8 @@ def testLoadOldModel(self):
self.assertTrue(model.trainables.vectors_lockf.shape == (12,))
self.assertTrue(model.vocabulary.cum_table.shape == (12,))

self.onlineSanity(model, trained_model=True)

# Model stored in multiple files
model_file = 'word2vec_old_sep'
model = word2vec.Word2Vec.load(datapath(model_file))
Expand All @@ -774,12 +776,43 @@ def testLoadOldModel(self):
self.assertTrue(model.trainables.vectors_lockf.shape == (12,))
self.assertTrue(model.vocabulary.cum_table.shape == (12,))

self.onlineSanity(model, trained_model=True)

# load really old model
model_file = 'w2v-lee-v0.12.0'
model = word2vec.Word2Vec.load(datapath(model_file))
self.onlineSanity(model, trained_model=True)

# test for max_final_vocab for model saved in 3.3
model_file = 'word2vec_3.3'
model = word2vec.Word2Vec.load(datapath(model_file))
self.assertEqual(model.max_final_vocab, None)
self.assertEqual(model.vocabulary.max_final_vocab, None)

# Test loading word2vec models from all previous versions
old_versions = [
'0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4',
'0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4',
'1.0.0', '1.0.1', '2.0.0', '2.1.0', '2.2.0', '2.3.0',
'3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0'
]

saved_models_dir = datapath('old_w2v_models/w2v_{}.mdl')
for old_version in old_versions:
model = word2vec.Word2Vec.load(saved_models_dir.format(old_version))
self.assertTrue(len(model.wv.vocab) == 3)
self.assertTrue(model.wv.vectors.shape == (3, 4))
# check if similarity search and online training works.
self.assertTrue(len(model.wv.most_similar('sentence')) == 2)
model.build_vocab(list_corpus, update=True)
model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)
# check if similarity search and online training works after saving and loading back the model.
tmpf = get_tmpfile('gensim_word2vec.tst')
model.save(tmpf)
loaded_model = word2vec.Word2Vec.load(tmpf)
loaded_model.build_vocab(list_corpus, update=True)
loaded_model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)

@log_capture()
def testBuildVocabWarning(self, l):
"""Test if warning is raised on non-ideal input to a word2vec model"""
Expand Down