From 2e9d2a5d22cc2a7d9246adb18af7bac3a895f95e Mon Sep 17 00:00:00 2001
From: Vlad Zhukov
Date: Mon, 31 Oct 2016 19:46:53 +0300
Subject: [PATCH 01/16] issue #446 add finished_training method

---
 gensim/models/doc2vec.py     | 21 +++++++++++++++++----
 gensim/models/word2vec.py    | 11 +++++++++++
 gensim/test/test_doc2vec.py  |  8 ++++++++
 gensim/test/test_word2vec.py | 16 +++++++++++++++-
 4 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index c9f39f3299..b14ec51c16 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -508,8 +508,8 @@ def similarity_unseen_docs(self, model, doc_words1, doc_words2, alpha=0.1, min_a
         d1 = model.infer_vector(doc_words=doc_words1, alpha=alpha, min_alpha=min_alpha, steps=steps)
         d2 = model.infer_vector(doc_words=doc_words2, alpha=alpha, min_alpha=min_alpha, steps=steps)
         return dot(matutils.unitvec(d1), matutils.unitvec(d2))
-
-
+
+
 class Doctag(namedtuple('Doctag', 'offset, word_count, doc_count')):
     """A string document tag discovered during the initial vocabulary scan.
     (The document-vector equivalent of a Vocab object.)
@@ -553,7 +553,7 @@ def __init__(self, documents=None, size=300, alpha=0.025, window=8, min_count=5,
         `alpha` is the initial learning rate (will linearly drop to zero as training
         progresses).
 
-        `seed` = for the random number generator. 
+        `seed` = for the random number generator.
         Note that for a fully deterministically-reproducible run, you must also limit the model to
         a single worker thread, to eliminate ordering jitter from OS thread scheduling. (In Python
         3, reproducibility between interpreter launches also requires use of the PYTHONHASHSEED
@@ -570,7 +570,7 @@ def __init__(self, documents=None, size=300, alpha=0.025, window=8, min_count=5,
         `workers` = use this many worker threads to train the model (=faster training with
         multicore machines).
 
-        `iter` = number of iterations (epochs) over the corpus. The default inherited from Word2Vec is 5, 
+        `iter` = number of iterations (epochs) over the corpus. The default inherited from Word2Vec is 5,
         but values of 10 or 20 are common in published 'Paragraph Vector' experiments.
 
         `hs` = if 1 (default), hierarchical sampling will be used for model training (else set to 0).
@@ -778,6 +778,19 @@ def __str__(self):
             segments.append('t%d' % self.workers)
         return '%s(%s)' % (self.__class__.__name__, ','.join(segments))
 
+    def finished_training(self):
+        """
+        Discard parametrs that are used in training and score. Use if you're sure you're done training a model,
+        """
+        self.training_finished = True
+        if hasattr(self, 'syn1') and not self.hs:
+            del self.syn1
+        if hasattr(self, 'syn1neg') and not self.negative:
+            del self.syn1neg
+        if hasattr(self, 'doctag_syn0'):
+            del self.doctag_syn0
+        if hasattr(self, 'doctag_syn0_lockf'):
+            del self.doctag_syn0_lockf
 
 class TaggedBrownCorpus(object):
     """Iterate over documents from the Brown corpus (part of NLTK data), yielding

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 9bb50698c3..ac752ceac8 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1750,6 +1750,17 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c
     def __str__(self):
         return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.index2word), self.vector_size, self.alpha)
 
+    def finished_training(self):
+        """
+        Discard parametrs that are used in training and score. Use if you're sure you're done training a model,
+        """
+        self.training_finished = True
+        self.init_sims(replace = True)
+        if hasattr(self, 'syn1neg'):
+            del self.syn1neg
+        if hasattr(self, 'syn0_lockf'):
+            del self.syn0_lockf
+
     def save(self, *args, **kwargs):
         # don't bother storing the cached normalized vectors, recalculable table
         kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'table', 'cum_table'])

diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
index 42264c0b4b..e204b74efa 100644
--- a/gensim/test/test_doc2vec.py
+++ b/gensim/test/test_doc2vec.py
@@ -280,6 +280,14 @@ def models_equal(self, model, model2):
         self.assertEqual(len(model.docvecs.offset2doctag), len(model2.docvecs.offset2doctag))
         self.assertTrue(np.allclose(model.docvecs.doctag_syn0, model2.docvecs.doctag_syn0))
 
+    def test_finished_training(self):
+        """Test doc2vec model after finishing training"""
+        for i in [0, 1]:
+            for j in [0, 1]:
+                model = doc2vec.Doc2Vec(sentences, size=5, min_count=1, negative=i, hs=j)
+                model.finished_training()
+                self.assertTrue(len(model.infer_vector(['graph'])), 5)
+
     @log_capture()
     def testBuildVocabWarning(self, l):
         """Test if logger warning is raised on non-ideal input to a doc2vec model"""

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index a2d7e6c743..34e7decee4 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -434,7 +434,7 @@ def testSimilarities(self):
         model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2)
         model.build_vocab(sentences)
         model.train(sentences)
-
+
         self.assertTrue(model.n_similarity(['graph', 'trees'], ['trees', 'graph']))
         self.assertTrue(model.n_similarity(['graph'], ['trees']) == model.similarity('graph', 'trees'))
         self.assertRaises(ZeroDivisionError, model.n_similarity, ['graph', 'trees'], [])
@@ -482,6 +482,20 @@ def models_equal(self, model, model2):
         most_common_word = max(model.vocab.items(), key=lambda item: item[1].count)[0]
         self.assertTrue(numpy.allclose(model[most_common_word], model2[most_common_word]))
 
+    def testFinishedTraining(self):
+        """Test word2vec model after finishing training"""
+        for i in [0, 1]:
+            for j in [0, 1]:
+                model = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=i, negative=j)
+                model.finished_training()
+                self.assertTrue(len(model.vocab), 12)
+                self.assertTrue(model.vocab['graph'].count, 3)
+        model = word2vec.Word2Vec(sentences, min_count=1)
+        model.save_word2vec_format(testfile(), binary=True)
+        norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True)
+        norm_only_model.finished_training()
+        self.assertFalse(numpy.allclose(model['human'], norm_only_model['human']))
+
     @log_capture()
     def testBuildVocabWarning(self, l):
         """Test if warning is raised on non-ideal input to a word2vec model"""
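For orientation, a minimal sketch of how the method added in this first patch is meant to be used (the toy corpus below is hypothetical, not the `sentences` fixture from gensim's test suite):

    from gensim.models import word2vec

    corpus = [['human', 'interface', 'computer'], ['graph', 'trees'], ['graph', 'minors', 'trees']]
    model = word2vec.Word2Vec(corpus, size=10, min_count=1, hs=1, negative=0)
    model.finished_training()  # init_sims(replace=True), then drop syn1neg and syn0_lockf if present
    print(model.most_similar('graph'))  # similarity queries still work on the normalized vectors

Note that this first version trims the negative-sampling and lock-factor arrays unconditionally but leaves `syn1` alone; the follow-up patches below rework that.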
From a2efb8c0b940436d2a21331c8b6b169b7a17407c Mon Sep 17 00:00:00 2001
From: Vlad Zhukov
Date: Tue, 1 Nov 2016 18:37:50 +0300
Subject: [PATCH 02/16] private _minimize_model, tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We can't just call «the super method in word2vec explicitly» without
adding the flag to save syn0_lockf, which is necessary to save in d2v.
---
 gensim/models/doc2vec.py     |  9 +++------
 gensim/models/word2vec.py    | 19 +++++++++++++------
 gensim/test/test_doc2vec.py  | 14 ++++++++++++--
 gensim/test/test_word2vec.py |  6 +++++-
 4 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index b14ec51c16..db83deb80f 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -392,6 +392,7 @@ def init_sims(self, replace=False):
         etc., but not `train` or `infer_vector`.

         """
+        print ('HELLO DOC!!!')
         if getattr(self, 'doctag_syn0norm', None) is None or replace:
             logger.info("precomputing L2-norms of doc weight vectors")
             if replace:
@@ -780,13 +781,9 @@ def __str__(self):
 
     def finished_training(self):
         """
-        Discard parametrs that are used in training and score. Use if you're sure you're done training a model,
+        Discard parametrs that are used in training and score. Use if you're sure you're done training a model.
         """
-        self.training_finished = True
-        if hasattr(self, 'syn1') and not self.hs:
-            del self.syn1
-        if hasattr(self, 'syn1neg') and not self.negative:
-            del self.syn1neg
+        self._minimize_model(self.hs, self.negative > 0, True)
         if hasattr(self, 'doctag_syn0'):
             del self.doctag_syn0
         if hasattr(self, 'doctag_syn0_lockf'):

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index ac752ceac8..86dce2f43c 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1750,16 +1750,23 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c
     def __str__(self):
         return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.index2word), self.vector_size, self.alpha)
 
+    def _minimize_model(self, save_syn1 = False, save_syn1neg = False, save_syn0_lockf = False):
+        if hasattr(self, 'syn1') and not save_syn1:
+            del self.syn1
+        if hasattr(self, 'syn1neg') and not save_syn1neg:
+            del self.syn1neg
+        if hasattr(self, 'syn0_lockf') and not save_syn0_lockf:
+            del self.syn0_lockf
+
     def finished_training(self):
         """
-        Discard parametrs that are used in training and score. Use if you're sure you're done training a model,
+        Discard parametrs that are used in training and score. Use if you're sure you're done training a model.
         """
         self.training_finished = True
-        self.init_sims(replace = True)
-        if hasattr(self, 'syn1neg'):
-            del self.syn1neg
-        if hasattr(self, 'syn0_lockf'):
-            del self.syn0_lockf
+        for i in xrange(self.syn0.shape[0]):
+            self.syn0[i, :] /= sqrt((self.syn0[i, :] ** 2).sum(-1))
+        self.syn0norm = self.syn0
+        self._minimize_model()
 
     def save(self, *args, **kwargs):
         # don't bother storing the cached normalized vectors, recalculable table

diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
index e204b74efa..24f4d588b3 100644
--- a/gensim/test/test_doc2vec.py
+++ b/gensim/test/test_doc2vec.py
@@ -284,9 +284,19 @@ def test_finished_training(self):
         """Test doc2vec model after finishing training"""
         for i in [0, 1]:
             for j in [0, 1]:
-                model = doc2vec.Doc2Vec(sentences, size=5, min_count=1, negative=i, hs=j)
+                model = doc2vec.Doc2Vec(sentences, size=5, min_count=1, hs=i, negative=j)
                 model.finished_training()
-                self.assertTrue(len(model.infer_vector(['graph'])), 5)
+                self.assertTrue(len(model['human']), 10)
+                self.assertTrue(model.vocab['graph'].count, 5)
+                if (i == 1):
+                    self.assertTrue(hasattr(model, 'syn1'))
+                else:
+                    self.assertTrue(not hasattr(model, 'syn1'))
+                if (j == 1):
+                    self.assertTrue(hasattr(model, 'syn1neg'))
+                else:
+                    self.assertTrue(not hasattr(model, 'syn1neg'))
+                self.assertTrue(hasattr(model, 'syn0_lockf'))
 
     @log_capture()
     def testBuildVocabWarning(self, l):

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 34e7decee4..99d6db4a63 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -488,8 +488,12 @@ def testFinishedTraining(self):
             for j in [0, 1]:
                 model = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=i, negative=j)
                 model.finished_training()
+                self.assertTrue(len(model['human']), 10)
                 self.assertTrue(len(model.vocab), 12)
-                self.assertTrue(model.vocab['graph'].count, 3)
+                self.assertTrue(model.vocab['graph'].count, 3)
+                self.assertTrue(not hasattr(model, 'syn1'))
+                self.assertTrue(not hasattr(model, 'syn1neg'))
+                self.assertTrue(not hasattr(model, 'syn0_lockf'))
         model = word2vec.Word2Vec(sentences, min_count=1)
         model.save_word2vec_format(testfile(), binary=True)
         norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True)

From 26e6042dfbeb9aaca922f487e9f6088f43742b8d Mon Sep 17 00:00:00 2001
From: Vlad Zhukov
Date: Tue, 1 Nov 2016 18:39:12 +0300
Subject: [PATCH 03/16] fix_print

---
 gensim/models/doc2vec.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index db83deb80f..c833f5a844 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -392,7 +392,6 @@ def init_sims(self, replace=False):
         etc., but not `train` or `infer_vector`.

         """
-        print ('HELLO DOC!!!')
         if getattr(self, 'doctag_syn0norm', None) is None or replace:
             logger.info("precomputing L2-norms of doc weight vectors")
             if replace:

From ba8c8c401b9fa95843ea66ad05e777b9d83863b7 Mon Sep 17 00:00:00 2001
From: Vlad Zhukov
Date: Wed, 2 Nov 2016 17:54:29 +0300
Subject: [PATCH 04/16] flag finished_training fix

---
 gensim/models/word2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 86dce2f43c..2011186ea7 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1751,6 +1751,7 @@ def __str__(self):
         return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.index2word), self.vector_size, self.alpha)
 
     def _minimize_model(self, save_syn1 = False, save_syn1neg = False, save_syn0_lockf = False):
+        self.training_finished = True
         if hasattr(self, 'syn1') and not save_syn1:
             del self.syn1
         if hasattr(self, 'syn1neg') and not save_syn1neg:
            del self.syn1neg
@@ -1762,7 +1763,6 @@ def finished_training(self):
         """
         Discard parametrs that are used in training and score. Use if you're sure you're done training a model.
         """
-        self.training_finished = True
         for i in xrange(self.syn0.shape[0]):
             self.syn0[i, :] /= sqrt((self.syn0[i, :] ** 2).sum(-1))
         self.syn0norm = self.syn0

From 51a64bab06233843916bc476c1254ab1fda532fb Mon Sep 17 00:00:00 2001
From: Vlad Zhukov
Date: Thu, 3 Nov 2016 20:24:05 +0300
Subject: [PATCH 05/16] fix_bug with docvecs, controllability

---
 gensim/models/doc2vec.py     | 15 +++++++++------
 gensim/models/word2vec.py    | 17 +++++++++++------
 gensim/test/test_doc2vec.py  |  6 +++---
 gensim/test/test_word2vec.py |  8 ++++----
 4 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index c833f5a844..214b6fec27 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -778,15 +778,18 @@ def __str__(self):
             segments.append('t%d' % self.workers)
         return '%s(%s)' % (self.__class__.__name__, ','.join(segments))
 
-    def finished_training(self):
+    def discard_model_parameters(self, remove_doctags_vectors=False):
         """
-        Discard parametrs that are used in training and score. Use if you're sure you're done training a model.
+        Discard parameters that are used in training and score. Use if you're sure you're done training a model.
+        Use `remove_doctags_vectors` if you don't want to save doctags vectors.
+        Useful in case when you only need to use infer_vector,
+        but don't want to use docvecs's most_similar, similarity etc. methods.
         """
         self._minimize_model(self.hs, self.negative > 0, True)
-        if hasattr(self, 'doctag_syn0'):
-            del self.doctag_syn0
-        if hasattr(self, 'doctag_syn0_lockf'):
-            del self.doctag_syn0_lockf
+        if self.docvecs and hasattr(self.docvecs, 'doctag_syn0') and remove_doctags_vectors:
+            del self.docvecs.doctag_syn0
+        if self.docvecs and hasattr(self.docvecs, 'doctag_syn0_lockf'):
+            del self.docvecs.doctag_syn0_lockf
 
 class TaggedBrownCorpus(object):
     """Iterate over documents from the Brown corpus (part of NLTK data), yielding

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 2011186ea7..efdb5b3975 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -465,7 +465,7 @@ def __init__(
         self.total_train_time = 0
         self.sorted_vocab = sorted_vocab
         self.batch_words = batch_words
-
+        self.training_finished = False
         if sentences is not None:
             if isinstance(sentences, GeneratorType):
                 raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.")
@@ -757,6 +757,8 @@ def train(self, sentences, total_words=None, word_count=0,
         sentences are the same as those that were used to initially build the vocabulary.

         """
+        if (self.training_finished):
+            raise RuntimeError("parameters for training were discarded")
         if FAST_VERSION < 0:
             import warnings
             warnings.warn("C extension not loaded for Word2Vec, training will be slow. "
@@ -1759,13 +1761,16 @@ def _minimize_model(self, save_syn1 = False, save_syn1neg = False, save_syn0_loc
         if hasattr(self, 'syn0_lockf') and not save_syn0_lockf:
             del self.syn0_lockf
 
-    def finished_training(self):
+    def discard_model_parameters(self, replace=False):
         """
-        Discard parametrs that are used in training and score. Use if you're sure you're done training a model.
+        Discard parameters that are used in training and score. Use if you're sure you're done training a model.
+        If `replace` is set, forget the original vectors and only keep the normalized
+        ones = saves lots of memory!
         """
-        for i in xrange(self.syn0.shape[0]):
-            self.syn0[i, :] /= sqrt((self.syn0[i, :] ** 2).sum(-1))
-        self.syn0norm = self.syn0
+        if replace:
+            for i in xrange(self.syn0.shape[0]):
+                self.syn0[i, :] /= sqrt((self.syn0[i, :] ** 2).sum(-1))
+            self.syn0norm = self.syn0
         self._minimize_model()
 
     def save(self, *args, **kwargs):

diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
index 24f4d588b3..e0310a6088 100644
--- a/gensim/test/test_doc2vec.py
+++ b/gensim/test/test_doc2vec.py
@@ -280,12 +280,12 @@ def models_equal(self, model, model2):
         self.assertEqual(len(model.docvecs.offset2doctag), len(model2.docvecs.offset2doctag))
         self.assertTrue(np.allclose(model.docvecs.doctag_syn0, model2.docvecs.doctag_syn0))
 
-    def test_finished_training(self):
-        """Test doc2vec model after finishing training"""
+    def test_discard_model_parameters(self):
+        """Test doc2vec model after discard_model_parameters"""
         for i in [0, 1]:
             for j in [0, 1]:
                 model = doc2vec.Doc2Vec(sentences, size=5, min_count=1, hs=i, negative=j)
-                model.finished_training()
+                model.discard_model_parameters(remove_doctags_vectors=True)
                 self.assertTrue(len(model['human']), 10)

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 99d6db4a63..ee6f1e40da 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -482,12 +482,12 @@ def models_equal(self, model, model2):
         most_common_word = max(model.vocab.items(), key=lambda item: item[1].count)[0]
         self.assertTrue(numpy.allclose(model[most_common_word], model2[most_common_word]))
 
-    def testFinishedTraining(self):
-        """Test word2vec model after finishing training"""
+    def testDiscardModelParameters(self):
+        """Test word2vec model after discard_model_parameters"""
         for i in [0, 1]:
             for j in [0, 1]:
                 model = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=i, negative=j)
-                model.finished_training()
+                model.discard_model_parameters(replace=True)
                 self.assertTrue(len(model['human']), 10)
                 self.assertTrue(len(model.vocab), 12)
                 self.assertTrue(model.vocab['graph'].count, 3)
@@ -497,8 +497,8 @@ def testFinishedTraining(self):
         model = word2vec.Word2Vec(sentences, min_count=1)
         model.save_word2vec_format(testfile(), binary=True)
         norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True)
-        norm_only_model.finished_training()
+        norm_only_model.discard_model_parameters(replace=True)
         self.assertFalse(numpy.allclose(model['human'], norm_only_model['human']))
From c73098403fcc9129c7d203148b3f5e8aa6d1ec81 Mon Sep 17 00:00:00 2001
From: Vlad Zhukov
Date: Wed, 9 Nov 2016 13:23:33 +0300
Subject: [PATCH 06/16] rename flag, flag move, init_sims

---
 gensim/models/word2vec.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index efdb5b3975..16628a73b5 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -465,7 +465,7 @@ def __init__(
         self.total_train_time = 0
         self.sorted_vocab = sorted_vocab
         self.batch_words = batch_words
-        self.training_finished = False
+        self.model_trimmed_post_training = False
         if sentences is not None:
             if isinstance(sentences, GeneratorType):
                 raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.")
@@ -757,7 +757,7 @@ def train(self, sentences, total_words=None, word_count=0,
         sentences are the same as those that were used to initially build the vocabulary.

         """
-        if (self.training_finished):
+        if (self.model_trimmed_post_training):
             raise RuntimeError("parameters for training were discarded")
         if FAST_VERSION < 0:
             import warnings
@@ -1753,13 +1753,13 @@ def __str__(self):
         return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.index2word), self.vector_size, self.alpha)
 
     def _minimize_model(self, save_syn1 = False, save_syn1neg = False, save_syn0_lockf = False):
-        self.training_finished = True
         if hasattr(self, 'syn1') and not save_syn1:
             del self.syn1
         if hasattr(self, 'syn1neg') and not save_syn1neg:
             del self.syn1neg
         if hasattr(self, 'syn0_lockf') and not save_syn0_lockf:
             del self.syn0_lockf
+        self.model_trimmed_post_training = True
 
     def discard_model_parameters(self, replace=False):
         """
@@ -1768,9 +1768,7 @@ def discard_model_parameters(self, replace=False):
         If `replace` is set, forget the original vectors and only keep the normalized
         ones = saves lots of memory!
         """
         if replace:
-            for i in xrange(self.syn0.shape[0]):
-                self.syn0[i, :] /= sqrt((self.syn0[i, :] ** 2).sum(-1))
-            self.syn0norm = self.syn0
+            self.init_sims(replace=True)
         self._minimize_model()

From a7cd9ba407ff24bd00147fb87904ad8105a3cc2f Mon Sep 17 00:00:00 2001
From: Vlad Zhukov
Date: Wed, 9 Nov 2016 17:21:23 +0300
Subject: [PATCH 07/16] renaming the RuntimeError message

---
 gensim/models/word2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 16628a73b5..4c67246cf6 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -758,7 +758,7 @@ def train(self, sentences, total_words=None, word_count=0,

         """
         if (self.model_trimmed_post_training):
-            raise RuntimeError("parameters for training were discarded")
+            raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method")
         if FAST_VERSION < 0:
             import warnings
             warnings.warn("C extension not loaded for Word2Vec, training will be slow. "
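The guard this message belongs to surfaces as in the following sketch (hypothetical `corpus` again):

    model = word2vec.Word2Vec(corpus, size=10, min_count=1)
    model.discard_model_parameters()
    model.train(corpus)
    # RuntimeError: Parameters for training were discarded using model_trimmed_post_training method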
" From a8cb0e7cfc122018dde7e439e0615f6c6a9f4c1b Mon Sep 17 00:00:00 2001 From: Vlad Zhukov Date: Thu, 10 Nov 2016 20:48:34 +0300 Subject: [PATCH 08/16] fix, add more tests --- gensim/models/doc2vec.py | 15 +++++++++------ gensim/models/word2vec.py | 2 +- gensim/test/test_doc2vec.py | 32 +++++++++++++++++++------------- gensim/test/test_word2vec.py | 10 ++++++---- 4 files changed, 35 insertions(+), 24 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 214b6fec27..d47db36776 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -778,15 +778,18 @@ def __str__(self): segments.append('t%d' % self.workers) return '%s(%s)' % (self.__class__.__name__, ','.join(segments)) - def discard_model_parameters(self, remove_doctags_vectors=False): + def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inference=True): """ Discard parameters that are used in training and score. Use if you're sure you're done training a model. - Use `remove_doctags_vectors` if you don't want to save doctags vectors. - Useful in case when you only need to use infer_vector, - but don't want to use docvecs's most_similar, similarity etc. methods. + Use `remove_doctags_vectors` if you don't want to save doctags vectors, + in this case you can't to use docvecs's most_similar, similarity etc. methods. + Use `no_inference` if you don't want to store parameters that is used for infer_vector method (you will not be able to use infer_vector) """ - self._minimize_model(self.hs, self.negative > 0, True) - if self.docvecs and hasattr(self.docvecs, 'doctag_syn0') and remove_doctags_vectors: + if keep_inference: + self._minimize_model(self.hs, self.negative > 0, True) + else: + self._minimize_model(False, False, False) + if self.docvecs and hasattr(self.docvecs, 'doctag_syn0') and not keep_doctags_vectors: del self.docvecs.doctag_syn0 if self.docvecs and hasattr(self.docvecs, 'doctag_syn0_lockf'): del self.docvecs.doctag_syn0_lockf diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 4c67246cf6..135a00058e 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1761,7 +1761,7 @@ def _minimize_model(self, save_syn1 = False, save_syn1neg = False, save_syn0_loc del self.syn0_lockf self.model_trimmed_post_training = True - def discard_model_parameters(self, replace=False): + def delete_temporary_training_data(self, replace=False): """ Discard parameters that are used in training and score. Use if you're sure you're done training a model. 
If `replace` is set, forget the original vectors and only keep the normalized diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index e0310a6088..86749f37ac 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -280,23 +280,29 @@ def models_equal(self, model, model2): self.assertEqual(len(model.docvecs.offset2doctag), len(model2.docvecs.offset2doctag)) self.assertTrue(np.allclose(model.docvecs.doctag_syn0, model2.docvecs.doctag_syn0)) - def test_discard_model_parameters(self): - """Test doc2vec model after discard_model_parameters""" + def test_delete_temporary_training_data(self): + """Test doc2vec model after delete_temporary_training_data""" for i in [0, 1]: for j in [0, 1]: - model = doc2vec.Doc2Vec(sentences, size=5, min_count=1, hs=i, negative=j) - model.discard_model_parameters(remove_doctags_vectors=True) + if i == 0 and j == 0: + continue + model = doc2vec.Doc2Vec(sentences, size=5, min_count=1, window=4, hs=i, negative=j) + model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=False) self.assertTrue(len(model['human']), 10) self.assertTrue(model.vocab['graph'].count, 5) - if (i == 1): - self.assertTrue(hasattr(model, 'syn1')) - else: - self.assertTrue(not hasattr(model, 'syn1')) - if (j == 1): - self.assertTrue(hasattr(model, 'syn1neg')) - else: - self.assertTrue(not hasattr(model, 'syn1neg')) - self.assertTrue(hasattr(model, 'syn0_lockf')) + self.assertTrue(not hasattr(model, 'syn1')) + self.assertTrue(not hasattr(model, 'syn1neg')) + self.assertTrue(not hasattr(model, 'syn0_lockf')) + self.assertTrue(model.docvecs and not hasattr(model.docvecs, 'doctag_syn0')) + self.assertTrue(model.docvecs and not hasattr(model.docvecs, 'doctag_syn0_lockf')) + model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=1, negative=0, alpha=0.05, min_count=2, iter=20) + model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True) + self.assertTrue(model.docvecs and hasattr(model.docvecs, 'doctag_syn0')) + self.assertTrue(hasattr(model, 'syn1')) + self.model_sanity(model) + model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=0, negative=1, alpha=0.05, min_count=2, iter=20) + model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True) + self.assertTrue(hasattr(model, 'syn1neg')) @log_capture() def testBuildVocabWarning(self, l): diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index ee6f1e40da..5ec8d1fb36 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -482,22 +482,24 @@ def models_equal(self, model, model2): most_common_word = max(model.vocab.items(), key=lambda item: item[1].count)[0] self.assertTrue(numpy.allclose(model[most_common_word], model2[most_common_word])) - def testDiscardModelParameters(self): - """Test word2vec model after discard_model_parameters""" + def testDeleteTemporaryTrainingData(self): + """Test word2vec model after delete_temporary_training_data""" for i in [0, 1]: for j in [0, 1]: model = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=i, negative=j) - model.discard_model_parameters(replace=True) + model.delete_temporary_training_data(replace=True) self.assertTrue(len(model['human']), 10) self.assertTrue(len(model.vocab), 12) self.assertTrue(model.vocab['graph'].count, 3) self.assertTrue(not hasattr(model, 'syn1')) self.assertTrue(not hasattr(model, 'syn1neg')) self.assertTrue(not hasattr(model, 'syn0_lockf')) + + def 
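A sketch of the Doc2Vec flag combinations this patch introduces (hypothetical `docs` corpus of TaggedDocument objects):

    d2v = doc2vec.Doc2Vec(docs, size=10, min_count=1)
    d2v.delete_temporary_training_data()  # defaults keep doctag vectors and the inference weights

    d2v2 = doc2vec.Doc2Vec(docs, size=10, min_count=1)
    d2v2.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=False)
    # after this call, d2v2.docvecs.most_similar(...) and d2v2.infer_vector(...) are both unavailable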
From 18ca26f9e51f36da818400efd8abfe7946740a5a Mon Sep 17 00:00:00 2001
From: Vlad Zhukov
Date: Thu, 10 Nov 2016 20:55:27 +0300
Subject: [PATCH 09/16] fix, i == j

---
 gensim/test/test_doc2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
index 86749f37ac..cb9fe37ee3 100644
--- a/gensim/test/test_doc2vec.py
+++ b/gensim/test/test_doc2vec.py
@@ -284,7 +284,7 @@ def test_delete_temporary_training_data(self):
         """Test doc2vec model after delete_temporary_training_data"""
         for i in [0, 1]:
             for j in [0, 1]:
-                if i == 0 and j == 0:
+                if i != j:
                     continue
                 model = doc2vec.Doc2Vec(sentences, size=5, min_count=1, window=4, hs=i, negative=j)
                 model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=False)

From a258241b2e786ac499956161df606e36c421bcd0 Mon Sep 17 00:00:00 2001
From: Vlad Zhukov
Date: Thu, 10 Nov 2016 21:09:11 +0300
Subject: [PATCH 10/16] fix

---
 gensim/test/test_doc2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
index cb9fe37ee3..86749f37ac 100644
--- a/gensim/test/test_doc2vec.py
+++ b/gensim/test/test_doc2vec.py
@@ -284,7 +284,7 @@ def test_delete_temporary_training_data(self):
         """Test doc2vec model after delete_temporary_training_data"""
         for i in [0, 1]:
             for j in [0, 1]:
-                if i != j:
+                if i == 0 and j == 0:
                     continue
                 model = doc2vec.Doc2Vec(sentences, size=5, min_count=1, window=4, hs=i, negative=j)
                 model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=False)

From 9acf1191f50a2f2bd6ae5228f1a03e1a5ed76501 Mon Sep 17 00:00:00 2001
From: Vlad Zhukov
Date: Thu, 10 Nov 2016 22:32:12 +0300
Subject: [PATCH 11/16] tests_fix

---
 gensim/test/test_doc2vec.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
index 86749f37ac..cd876e36ec 100644
--- a/gensim/test/test_doc2vec.py
+++ b/gensim/test/test_doc2vec.py
@@ -302,6 +302,7 @@ def test_delete_temporary_training_data(self):
         model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=0, negative=1, alpha=0.05, min_count=2, iter=20)
         model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
+        self.model_sanity(model)
         self.assertTrue(hasattr(model, 'syn1neg'))
 
     @log_capture()

From 66fe5e353a2cf897435a3960f19c34430a8956d1 Mon Sep 17 00:00:00 2001
From: Vlad Zhukov
Date: Fri, 11 Nov 2016 01:02:29 +0300
Subject: [PATCH 12/16] delete useless code

---
 gensim/models/doc2vec.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index d47db36776..b6e6f8c3c3 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -785,9 +785,7 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inferen
         in this case you can't use docvecs's most_similar, similarity etc. methods.
         Use `no_inference` if you don't want to store parameters that are used for the infer_vector method (you will not be able to use infer_vector)
         """
-        if keep_inference:
-            self._minimize_model(self.hs, self.negative > 0, True)
-        else:
+        if not keep_inference:
             self._minimize_model(False, False, False)
         if self.docvecs and hasattr(self.docvecs, 'doctag_syn0') and not keep_doctags_vectors:
             del self.docvecs.doctag_syn0

From 4395b75c74337cc1727a15790ccfb33504129e5e Mon Sep 17 00:00:00 2001
From: Vlad Zhukov
Date: Fri, 11 Nov 2016 02:14:24 +0300
Subject: [PATCH 13/16] numpy fix

---
 gensim/test/test_word2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index cce1931d48..93bb3d41ab 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -500,7 +500,7 @@ def testNormalizeAfterTrainingData(self):
         model.save_word2vec_format(testfile(), binary=True)
         norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True)
         norm_only_model.delete_temporary_training_data(replace=True)
-        self.assertFalse(numpy.allclose(model['human'], norm_only_model['human']))
+        self.assertFalse(np.allclose(model['human'], norm_only_model['human']))
 
     @log_capture()
     def testBuildVocabWarning(self, l):

From 06c6028ced6b8dc2e12b45d752a4c7acb6a2c02d Mon Sep 17 00:00:00 2001
From: Vlad Zhukov
Date: Sat, 12 Nov 2016 02:34:56 +0300
Subject: [PATCH 14/16] hs,neg in tests; assert parameters existence

---
 gensim/test/test_doc2vec.py  | 7 +++++--
 gensim/test/test_word2vec.py | 5 +++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
index 3c638af17a..51392889d9 100644
--- a/gensim/test/test_doc2vec.py
+++ b/gensim/test/test_doc2vec.py
@@ -291,9 +291,12 @@ def test_delete_temporary_training_data(self):
         """Test doc2vec model after delete_temporary_training_data"""
         for i in [0, 1]:
             for j in [0, 1]:
-                if i == 0 and j == 0:
-                    continue
                 model = doc2vec.Doc2Vec(sentences, size=5, min_count=1, window=4, hs=i, negative=j)
+                if i:
+                    self.assertTrue(hasattr(model, 'syn1'))
+                if j:
+                    self.assertTrue(hasattr(model, 'syn1neg'))
+                self.assertTrue(hasattr(model, 'syn0_lockf'))
                 model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=False)
                 self.assertTrue(len(model['human']), 10)
                 self.assertTrue(model.vocab['graph'].count, 5)

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 93bb3d41ab..a5cae365c5 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -487,6 +487,11 @@ def testDeleteTemporaryTrainingData(self):
         for i in [0, 1]:
             for j in [0, 1]:
                 model = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=i, negative=j)
+                if i:
+                    self.assertTrue(hasattr(model, 'syn1'))
+                if j:
+                    self.assertTrue(hasattr(model, 'syn1neg'))
+                self.assertTrue(hasattr(model, 'syn0_lockf'))
                 model.delete_temporary_training_data(replace=True)
                 self.assertTrue(len(model['human']), 10)
                 self.assertTrue(len(model.vocab), 12)

From 5f96aa0f617dbd299d1e340ab8759f63d824fd2d Mon Sep 17 00:00:00 2001
From: Vlad Zhukov
Date: Sat, 12 Nov 2016 02:44:01 +0300
Subject: [PATCH 15/16] changelog update

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fd73c4f5fa..14a3657972 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,7 @@
 Changes
 =======
+0.13.5, 2016-11-12
+* Add delete_temporary_training_data() function to word2vec and doc2vec models. (@deepmipt-VladZhukov, [#987](https://github.com/RaRe-Technologies/gensim/pull/987))
 
 0.13.4, 2016-10-25
 * Passed all the params through the apply call in lda.get_document_topics(), test case to use the per_word_topics through the corpus in test_ldamodel (@parthoiiitm, [#978](https://github.com/RaRe-Technologies/gensim/pull/978))
From 84f174e6697f824c2eaa5d1c2a7a42bbbf038452 Mon Sep 17 00:00:00 2001
From: Vlad Zhukov
Date: Sun, 13 Nov 2016 15:41:19 +0300
Subject: [PATCH 16/16] rename replace, description fix

---
 gensim/models/doc2vec.py     | 4 ++--
 gensim/models/word2vec.py    | 6 +++---
 gensim/test/test_word2vec.py | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index 5f0f78f3ea..48807a8813 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -781,9 +781,9 @@ def __str__(self):
     def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inference=True):
         """
         Discard parameters that are used in training and score. Use if you're sure you're done training a model.
-        Use `remove_doctags_vectors` if you don't want to save doctags vectors,
+        Set `keep_doctags_vectors` to False if you don't want to save doctags vectors,
         in this case you can't use docvecs's most_similar, similarity etc. methods.
-        Use `no_inference` if you don't want to store parameters that are used for the infer_vector method (you will not be able to use infer_vector)
+        Set `keep_inference` to False if you don't want to store parameters that are used for the infer_vector method
         """
         if not keep_inference:
             self._minimize_model(False, False, False)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 8cb7a761a4..97f98c7614 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1762,13 +1762,13 @@ def _minimize_model(self, save_syn1 = False, save_syn1neg = False, save_syn0_loc
             del self.syn0_lockf
         self.model_trimmed_post_training = True
 
-    def delete_temporary_training_data(self, replace=False):
+    def delete_temporary_training_data(self, replace_word_vectors_with_normalized=False):
         """
         Discard parameters that are used in training and score. Use if you're sure you're done training a model.
-        If `replace` is set, forget the original vectors and only keep the normalized
+        If `replace_word_vectors_with_normalized` is set, forget the original vectors and only keep the normalized
         ones = saves lots of memory!
         """
-        if replace:
+        if replace_word_vectors_with_normalized:
             self.init_sims(replace=True)
         self._minimize_model()

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index a5cae365c5..57d47a98cf 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -492,7 +492,7 @@ def testDeleteTemporaryTrainingData(self):
                 if j:
                     self.assertTrue(hasattr(model, 'syn1neg'))
                 self.assertTrue(hasattr(model, 'syn0_lockf'))
-                model.delete_temporary_training_data(replace=True)
+                model.delete_temporary_training_data(replace_word_vectors_with_normalized=True)
                 self.assertTrue(len(model['human']), 10)
                 self.assertTrue(len(model.vocab), 12)
                 self.assertTrue(model.vocab['graph'].count, 3)
@@ -504,7 +504,7 @@ def testNormalizeAfterTrainingData(self):
         model = word2vec.Word2Vec(sentences, min_count=1)
         model.save_word2vec_format(testfile(), binary=True)
         norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True)
-        norm_only_model.delete_temporary_training_data(replace=True)
+        norm_only_model.delete_temporary_training_data(replace_word_vectors_with_normalized=True)
         self.assertFalse(np.allclose(model['human'], norm_only_model['human']))