diff --git a/gensim/models/deprecated/doc2vec.py b/gensim/models/deprecated/doc2vec.py index fc7535d69b..33d442904a 100644 --- a/gensim/models/deprecated/doc2vec.py +++ b/gensim/models/deprecated/doc2vec.py @@ -68,7 +68,8 @@ from gensim import utils from gensim.utils import call_on_class_only, deprecated -from gensim.models.deprecated.word2vec import Word2Vec, train_cbow_pair, train_sg_pair, train_batch_sg +from gensim.models.deprecated.word2vec import Word2Vec, train_cbow_pair, train_sg_pair, train_batch_sg,\ + MAX_WORDS_IN_BATCH from gensim.models.deprecated.keyedvectors import KeyedVectors from gensim.models.doc2vec import Doc2Vec as NewDoc2Vec from gensim.models.deprecated.old_saveload import SaveLoad @@ -104,8 +105,8 @@ def load_old_doc2vec(*args, **kwargs): 'cbow_mean': old_model.cbow_mean, 'hashfxn': old_model.hashfxn, 'iter': old_model.iter, - 'sorted_vocab': old_model.sorted_vocab, - 'batch_words': old_model.batch_words, + 'sorted_vocab': old_model.__dict__.get('sorted_vocab', 1), + 'batch_words': old_model.__dict__.get('batch_words', MAX_WORDS_IN_BATCH), 'compute_loss': old_model.__dict__.get('compute_loss', None) } new_model = NewDoc2Vec(**params) @@ -136,16 +137,26 @@ def load_old_doc2vec(*args, **kwargs): # set doc2vec vocabulary attributes new_model.docvecs.doctags = old_model.docvecs.doctags - new_model.docvecs.max_rawint = old_model.docvecs.max_rawint - new_model.docvecs.offset2doctag = old_model.docvecs.offset2doctag new_model.docvecs.count = old_model.docvecs.count - - new_model.train_count = old_model.train_count - new_model.corpus_count = old_model.corpus_count - new_model.running_training_loss = old_model.running_training_loss - new_model.total_train_time = old_model.total_train_time - new_model.min_alpha_yet_reached = old_model.min_alpha_yet_reached - new_model.model_trimmed_post_training = old_model.model_trimmed_post_training + if hasattr(old_model.docvecs, 'max_rawint'): # `doc2vec` models before `0.12.3` do not have these 2 attributes + 
new_model.docvecs.max_rawint = old_model.docvecs.__dict__.get('max_rawint') + new_model.docvecs.offset2doctag = old_model.docvecs.__dict__.get('offset2doctag') + else: + # Doc2Vec models before Gensim version 0.12.3 did not have `max_rawint` and `offset2doctag` as they did not + # support mixing of string and int tags. This implies the new attribute `offset2doctag` equals the old `index2doctag` + # (which was only filled if the documents had string tags). + # This also implies that the new attribute, `max_rawint` (highest rawint-indexed doctag), would either be equal + # to the initial value -1, in case only string tags are used, or would be equal to `count - 1` if only int indexing + # was used. + new_model.docvecs.max_rawint = -1 if old_model.docvecs.index2doctag else old_model.docvecs.count - 1 + new_model.docvecs.offset2doctag = old_model.docvecs.index2doctag + + new_model.train_count = old_model.__dict__.get('train_count', None) + new_model.corpus_count = old_model.__dict__.get('corpus_count', None) + new_model.running_training_loss = old_model.__dict__.get('running_training_loss', 0) + new_model.total_train_time = old_model.__dict__.get('total_train_time', None) + new_model.min_alpha_yet_reached = old_model.__dict__.get('min_alpha_yet_reached', old_model.alpha) + new_model.model_trimmed_post_training = old_model.__dict__.get('model_trimmed_post_training', None) return new_model diff --git a/gensim/models/deprecated/word2vec.py b/gensim/models/deprecated/word2vec.py index f5b5d87254..5ac913dbb9 100644 --- a/gensim/models/deprecated/word2vec.py +++ b/gensim/models/deprecated/word2vec.py @@ -151,13 +151,14 @@ def load_old_word2vec(*args, **kwargs): old_model = Word2Vec.load(*args, **kwargs) + vector_size = getattr(old_model, 'vector_size', old_model.layer1_size) params = { - 'size': old_model.vector_size, + 'size': vector_size, 'alpha': old_model.alpha, 'window': old_model.window, 'min_count': old_model.min_count, 'max_vocab_size': old_model.__dict__.get('max_vocab_size', 
None), - 'sample': old_model.sample, + 'sample': old_model.__dict__.get('sample', 1e-3), 'seed': old_model.seed, 'workers': old_model.workers, 'min_alpha': old_model.min_alpha, @@ -165,11 +166,11 @@ def load_old_word2vec(*args, **kwargs): 'hs': old_model.hs, 'negative': old_model.negative, 'cbow_mean': old_model.cbow_mean, - 'hashfxn': old_model.hashfxn, - 'iter': old_model.iter, - 'null_word': old_model.null_word, - 'sorted_vocab': old_model.sorted_vocab, - 'batch_words': old_model.batch_words, + 'hashfxn': old_model.__dict__.get('hashfxn', hash), + 'iter': old_model.__dict__.get('iter', 5), + 'null_word': old_model.__dict__.get('null_word', 0), + 'sorted_vocab': old_model.__dict__.get('sorted_vocab', 1), + 'batch_words': old_model.__dict__.get('batch_words', MAX_WORDS_IN_BATCH), 'compute_loss': old_model.__dict__.get('compute_loss', None) } new_model = NewWord2Vec(**params) @@ -186,13 +187,13 @@ def load_old_word2vec(*args, **kwargs): # set vocabulary attributes new_model.wv.vocab = old_model.wv.vocab new_model.wv.index2word = old_model.wv.index2word - new_model.vocabulary.cum_table = old_model.cum_table + new_model.vocabulary.cum_table = old_model.__dict__.get('cum_table', None) - new_model.train_count = old_model.train_count - new_model.corpus_count = old_model.corpus_count - new_model.running_training_loss = old_model.__dict__.get('running_training_loss', None) - new_model.total_train_time = old_model.total_train_time - new_model.min_alpha_yet_reached = old_model.min_alpha_yet_reached + new_model.train_count = old_model.__dict__.get('train_count', None) + new_model.corpus_count = old_model.__dict__.get('corpus_count', None) + new_model.running_training_loss = old_model.__dict__.get('running_training_loss', 0) + new_model.total_train_time = old_model.__dict__.get('total_train_time', None) + new_model.min_alpha_yet_reached = old_model.__dict__.get('min_alpha_yet_reached', old_model.alpha) new_model.model_trimmed_post_training = 
old_model.__dict__.get('model_trimmed_post_training', None) return new_model diff --git a/gensim/test/test_data/d2v-lee-v0.13.0 b/gensim/test/test_data/d2v-lee-v0.13.0 new file mode 100644 index 0000000000..088163624d Binary files /dev/null and b/gensim/test/test_data/d2v-lee-v0.13.0 differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_0.12.0.mdl b/gensim/test/test_data/old_d2v_models/d2v_0.12.0.mdl new file mode 100644 index 0000000000..293633fddd Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_0.12.0.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_0.12.1.mdl b/gensim/test/test_data/old_d2v_models/d2v_0.12.1.mdl new file mode 100644 index 0000000000..5bde01ee9c Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_0.12.1.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_0.12.2.mdl b/gensim/test/test_data/old_d2v_models/d2v_0.12.2.mdl new file mode 100644 index 0000000000..6eaf051b6f Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_0.12.2.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_0.12.3.mdl b/gensim/test/test_data/old_d2v_models/d2v_0.12.3.mdl new file mode 100644 index 0000000000..514b2f0692 Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_0.12.3.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_0.12.4.mdl b/gensim/test/test_data/old_d2v_models/d2v_0.12.4.mdl new file mode 100644 index 0000000000..760c6f7092 Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_0.12.4.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_0.13.0.mdl b/gensim/test/test_data/old_d2v_models/d2v_0.13.0.mdl new file mode 100644 index 0000000000..2639f20546 Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_0.13.0.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_0.13.1.mdl b/gensim/test/test_data/old_d2v_models/d2v_0.13.1.mdl new file mode 100644 index 
0000000000..262e691123 Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_0.13.1.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_0.13.2.mdl b/gensim/test/test_data/old_d2v_models/d2v_0.13.2.mdl new file mode 100644 index 0000000000..0c1ceaca55 Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_0.13.2.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_0.13.3.mdl b/gensim/test/test_data/old_d2v_models/d2v_0.13.3.mdl new file mode 100644 index 0000000000..6eb5c9031f Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_0.13.3.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_0.13.4.mdl b/gensim/test/test_data/old_d2v_models/d2v_0.13.4.mdl new file mode 100644 index 0000000000..425a6564b3 Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_0.13.4.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_1.0.0.mdl b/gensim/test/test_data/old_d2v_models/d2v_1.0.0.mdl new file mode 100644 index 0000000000..c487e04d63 Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_1.0.0.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_1.0.1.mdl b/gensim/test/test_data/old_d2v_models/d2v_1.0.1.mdl new file mode 100644 index 0000000000..92f82a18fc Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_1.0.1.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_2.0.0.mdl b/gensim/test/test_data/old_d2v_models/d2v_2.0.0.mdl new file mode 100644 index 0000000000..1ed1725a08 Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_2.0.0.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_2.1.0.mdl b/gensim/test/test_data/old_d2v_models/d2v_2.1.0.mdl new file mode 100644 index 0000000000..29d65b3ad1 Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_2.1.0.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_2.2.0.mdl 
b/gensim/test/test_data/old_d2v_models/d2v_2.2.0.mdl new file mode 100644 index 0000000000..8b1da4c876 Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_2.2.0.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_2.3.0.mdl b/gensim/test/test_data/old_d2v_models/d2v_2.3.0.mdl new file mode 100644 index 0000000000..319b5089a6 Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_2.3.0.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_3.0.0.mdl b/gensim/test/test_data/old_d2v_models/d2v_3.0.0.mdl new file mode 100644 index 0000000000..49f7b9aeca Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_3.0.0.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_3.1.0.mdl b/gensim/test/test_data/old_d2v_models/d2v_3.1.0.mdl new file mode 100644 index 0000000000..4616b0fe71 Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_3.1.0.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_3.2.0.mdl b/gensim/test/test_data/old_d2v_models/d2v_3.2.0.mdl new file mode 100644 index 0000000000..e58ea456a3 Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_3.2.0.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_3.3.0.mdl b/gensim/test/test_data/old_d2v_models/d2v_3.3.0.mdl new file mode 100644 index 0000000000..4c875a4a5b Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_3.3.0.mdl differ diff --git a/gensim/test/test_data/old_d2v_models/d2v_3.4.0.mdl b/gensim/test/test_data/old_d2v_models/d2v_3.4.0.mdl new file mode 100644 index 0000000000..0d69524eda Binary files /dev/null and b/gensim/test/test_data/old_d2v_models/d2v_3.4.0.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_0.12.0.mdl b/gensim/test/test_data/old_w2v_models/w2v_0.12.0.mdl new file mode 100644 index 0000000000..2cd37d590f Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_0.12.0.mdl differ diff --git 
a/gensim/test/test_data/old_w2v_models/w2v_0.12.1.mdl b/gensim/test/test_data/old_w2v_models/w2v_0.12.1.mdl new file mode 100644 index 0000000000..7336609d9d Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_0.12.1.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_0.12.2.mdl b/gensim/test/test_data/old_w2v_models/w2v_0.12.2.mdl new file mode 100644 index 0000000000..52df9cc4fd Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_0.12.2.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_0.12.3.mdl b/gensim/test/test_data/old_w2v_models/w2v_0.12.3.mdl new file mode 100644 index 0000000000..906d2fb928 Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_0.12.3.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_0.12.4.mdl b/gensim/test/test_data/old_w2v_models/w2v_0.12.4.mdl new file mode 100644 index 0000000000..8d9df5cf50 Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_0.12.4.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_0.13.0.mdl b/gensim/test/test_data/old_w2v_models/w2v_0.13.0.mdl new file mode 100644 index 0000000000..cb457c07c8 Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_0.13.0.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_0.13.1.mdl b/gensim/test/test_data/old_w2v_models/w2v_0.13.1.mdl new file mode 100644 index 0000000000..6db545c65a Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_0.13.1.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_0.13.2.mdl b/gensim/test/test_data/old_w2v_models/w2v_0.13.2.mdl new file mode 100644 index 0000000000..97c83753b8 Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_0.13.2.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_0.13.3.mdl b/gensim/test/test_data/old_w2v_models/w2v_0.13.3.mdl new file mode 100644 index 0000000000..b382d6a91b Binary files /dev/null and 
b/gensim/test/test_data/old_w2v_models/w2v_0.13.3.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_0.13.4.mdl b/gensim/test/test_data/old_w2v_models/w2v_0.13.4.mdl new file mode 100644 index 0000000000..7e57e7eeb7 Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_0.13.4.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_1.0.0.mdl b/gensim/test/test_data/old_w2v_models/w2v_1.0.0.mdl new file mode 100644 index 0000000000..2297e57b28 Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_1.0.0.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_1.0.1.mdl b/gensim/test/test_data/old_w2v_models/w2v_1.0.1.mdl new file mode 100644 index 0000000000..1791d41175 Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_1.0.1.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_2.0.0.mdl b/gensim/test/test_data/old_w2v_models/w2v_2.0.0.mdl new file mode 100644 index 0000000000..c6e12d4b4b Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_2.0.0.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_2.1.0.mdl b/gensim/test/test_data/old_w2v_models/w2v_2.1.0.mdl new file mode 100644 index 0000000000..d011657b52 Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_2.1.0.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_2.2.0.mdl b/gensim/test/test_data/old_w2v_models/w2v_2.2.0.mdl new file mode 100644 index 0000000000..df04023bb1 Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_2.2.0.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_2.3.0.mdl b/gensim/test/test_data/old_w2v_models/w2v_2.3.0.mdl new file mode 100644 index 0000000000..acf4b78c0c Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_2.3.0.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_3.0.0.mdl b/gensim/test/test_data/old_w2v_models/w2v_3.0.0.mdl new file mode 100644 index 
0000000000..19f1328ef4 Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_3.0.0.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_3.1.0.mdl b/gensim/test/test_data/old_w2v_models/w2v_3.1.0.mdl new file mode 100644 index 0000000000..d39e060c5f Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_3.1.0.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_3.2.0.mdl b/gensim/test/test_data/old_w2v_models/w2v_3.2.0.mdl new file mode 100644 index 0000000000..8662646936 Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_3.2.0.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_3.3.0.mdl b/gensim/test/test_data/old_w2v_models/w2v_3.3.0.mdl new file mode 100644 index 0000000000..3ba4eba086 Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_3.3.0.mdl differ diff --git a/gensim/test/test_data/old_w2v_models/w2v_3.4.0.mdl b/gensim/test/test_data/old_w2v_models/w2v_3.4.0.mdl new file mode 100644 index 0000000000..55d4627d53 Binary files /dev/null and b/gensim/test/test_data/old_w2v_models/w2v_3.4.0.mdl differ diff --git a/gensim/test/test_data/w2v-lee-v0.12.0 b/gensim/test/test_data/w2v-lee-v0.12.0 new file mode 100644 index 0000000000..6f8e50a247 Binary files /dev/null and b/gensim/test/test_data/w2v-lee-v0.12.0 differ diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index a41694e0dd..559e166d4f 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -122,6 +122,38 @@ def testLoadOldModel(self): self.model_sanity(model) + # load really old model + model_file = 'd2v-lee-v0.13.0' + model = doc2vec.Doc2Vec.load(datapath(model_file)) + self.model_sanity(model) + + # Test loading doc2vec models from all previous versions + old_versions = [ + '0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4', + '0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4', + '1.0.0', '1.0.1', '2.0.0', '2.1.0', '2.2.0', '2.3.0', + '3.0.0', '3.1.0', '3.2.0', 
'3.3.0', '3.4.0' + ] + + saved_models_dir = datapath('old_d2v_models/d2v_{}.mdl') + for old_version in old_versions: + model = doc2vec.Doc2Vec.load(saved_models_dir.format(old_version)) + self.assertTrue(len(model.wv.vocab) == 3) + self.assertTrue(model.wv.vectors.shape == (3, 4)) + self.assertTrue(model.docvecs.vectors_docs.shape == (2, 4)) + self.assertTrue(model.docvecs.count == 2) + # check if inferring vectors for new documents and similarity search works. + doc0_inferred = model.infer_vector(list(DocsLeeCorpus())[0].words) + sims_to_infer = model.docvecs.most_similar([doc0_inferred], topn=len(model.docvecs)) + self.assertTrue(sims_to_infer) + # check if inferring vectors and similarity search works after saving and loading back the model + tmpf = get_tmpfile('gensim_doc2vec.tst') + model.save(tmpf) + loaded_model = doc2vec.Doc2Vec.load(tmpf) + doc0_inferred = loaded_model.infer_vector(list(DocsLeeCorpus())[0].words) + sims_to_infer = loaded_model.docvecs.most_similar([doc0_inferred], topn=len(loaded_model.docvecs)) + self.assertTrue(sims_to_infer) + def test_unicode_in_doctag(self): """Test storing document vectors of a model with unicode titles.""" model = doc2vec.Doc2Vec(DocsLeeCorpus(unicode_tags=True), min_count=1) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 9641a332a1..411d200676 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -192,7 +192,7 @@ def testOnlineLearningAfterSave(self): model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter) self.assertEqual(len(model_neg.wv.vocab), 14) - def onlineSanity(self, model): + def onlineSanity(self, model, trained_model=False): terro, others = [], [] for l in list_corpus: if 'terrorism' in l: @@ -200,7 +200,7 @@ def onlineSanity(self, model): else: others.append(l) self.assertTrue(all(['terrorism' not in l for l in others])) - model.build_vocab(others) + model.build_vocab(others, update=trained_model) 
model.train(others, total_examples=model.corpus_count, epochs=model.iter) self.assertFalse('terrorism' in model.wv.vocab) model.build_vocab(terro, update=True) @@ -764,6 +764,8 @@ def testLoadOldModel(self): self.assertTrue(model.trainables.vectors_lockf.shape == (12,)) self.assertTrue(model.vocabulary.cum_table.shape == (12,)) + self.onlineSanity(model, trained_model=True) + # Model stored in multiple files model_file = 'word2vec_old_sep' model = word2vec.Word2Vec.load(datapath(model_file)) @@ -774,12 +776,43 @@ def testLoadOldModel(self): self.assertTrue(model.trainables.vectors_lockf.shape == (12,)) self.assertTrue(model.vocabulary.cum_table.shape == (12,)) + self.onlineSanity(model, trained_model=True) + + # load really old model + model_file = 'w2v-lee-v0.12.0' + model = word2vec.Word2Vec.load(datapath(model_file)) + self.onlineSanity(model, trained_model=True) + # test for max_final_vocab for model saved in 3.3 model_file = 'word2vec_3.3' model = word2vec.Word2Vec.load(datapath(model_file)) self.assertEqual(model.max_final_vocab, None) self.assertEqual(model.vocabulary.max_final_vocab, None) + # Test loading word2vec models from all previous versions + old_versions = [ + '0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4', + '0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4', + '1.0.0', '1.0.1', '2.0.0', '2.1.0', '2.2.0', '2.3.0', + '3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0' + ] + + saved_models_dir = datapath('old_w2v_models/w2v_{}.mdl') + for old_version in old_versions: + model = word2vec.Word2Vec.load(saved_models_dir.format(old_version)) + self.assertTrue(len(model.wv.vocab) == 3) + self.assertTrue(model.wv.vectors.shape == (3, 4)) + # check if similarity search and online training works. 
+ self.assertTrue(len(model.wv.most_similar('sentence')) == 2) + model.build_vocab(list_corpus, update=True) + model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter) + # check if similarity search and online training works after saving and loading back the model. + tmpf = get_tmpfile('gensim_word2vec.tst') + model.save(tmpf) + loaded_model = word2vec.Word2Vec.load(tmpf) + loaded_model.build_vocab(list_corpus, update=True) + loaded_model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter) + @log_capture() def testBuildVocabWarning(self, l): """Test if warning is raised on non-ideal input to a word2vec model"""