piskvorky · tmylk · Nov 13, 2016 · Oct 31, 2016 · Nov 1, 2016 · Nov 1, 2016
diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
@@ -508,8 +508,8 @@ def similarity_unseen_docs(self, model, doc_words1, doc_words2, alpha=0.1, min_a
         d1 = model.infer_vector(doc_words=doc_words1, alpha=alpha, min_alpha=min_alpha, steps=steps)
         d2 = model.infer_vector(doc_words=doc_words2, alpha=alpha, min_alpha=min_alpha, steps=steps)
         return dot(matutils.unitvec(d1), matutils.unitvec(d2))
-        
-        
+
+
 class Doctag(namedtuple('Doctag', 'offset, word_count, doc_count')):
     """A string document tag discovered during the initial vocabulary
     scan. (The document-vector equivalent of a Vocab object.)
@@ -553,7 +553,7 @@ def __init__(self, documents=None, size=300, alpha=0.025, window=8, min_count=5,
 
         `alpha` is the initial learning rate (will linearly drop to zero as training progresses).
 
-        `seed` = for the random number generator. 
+        `seed` = for the random number generator.
         Note that for a fully deterministically-reproducible run, you must also limit the model to
         a single worker thread, to eliminate ordering jitter from OS thread scheduling. (In Python
         3, reproducibility between interpreter launches also requires use of the PYTHONHASHSEED
@@ -570,7 +570,7 @@ def __init__(self, documents=None, size=300, alpha=0.025, window=8, min_count=5,
 
         `workers` = use this many worker threads to train the model (=faster training with multicore machines).
 
-        `iter` = number of iterations (epochs) over the corpus. The default inherited from Word2Vec is 5, 
+        `iter` = number of iterations (epochs) over the corpus. The default inherited from Word2Vec is 5,
         but values of 10 or 20 are common in published 'Paragraph Vector' experiments.
 
         `hs` = if 1 (default), hierarchical sampling will be used for model training (else set to 0).
@@ -778,6 +778,21 @@ def __str__(self):
             segments.append('t%d' % self.workers)
         return '%s(%s)' % (self.__class__.__name__, ','.join(segments))
 
+    def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inference=True):
+        """
+        Discard parameters that are used in training and score. Use if you're sure you're done training a model.
+        Set `keep_doctags_vectors` to False if you don't want to save doctags vectors;
+        in this case you can't use docvecs's most_similar, similarity etc. methods.
+        Set `keep_inference` to False if you don't want to store parameters that are used by the infer_vector method (you will not be able to use infer_vector).
+        """
+        if keep_inference:
+            self._minimize_model(save_syn1=self.hs, save_syn1neg=self.negative > 0, save_syn0_lockf=True)
+        else:
+            self._minimize_model(save_syn1=False, save_syn1neg=False, save_syn0_lockf=False)
+        if self.docvecs and hasattr(self.docvecs, 'doctag_syn0') and not keep_doctags_vectors:
+            del self.docvecs.doctag_syn0
+        if self.docvecs and hasattr(self.docvecs, 'doctag_syn0_lockf'):  # dropped regardless of the flags
+            del self.docvecs.doctag_syn0_lockf
 
 class TaggedBrownCorpus(object):
     """Iterate over documents from the Brown corpus (part of NLTK data), yielding

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
@@ -465,7 +465,7 @@ def __init__(
         self.total_train_time = 0
         self.sorted_vocab = sorted_vocab
         self.batch_words = batch_words
-
+        self.model_trimmed_post_training = False
         if sentences is not None:
             if isinstance(sentences, GeneratorType):
                 raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.")
@@ -757,6 +757,8 @@ def train(self, sentences, total_words=None, word_count=0,
         sentences are the same as those that were used to initially build the vocabulary.
 
         """
+        if getattr(self, 'model_trimmed_post_training', False):  # flag may be missing on instances created before it existed
+            raise RuntimeError("Parameters for training were discarded using the delete_temporary_training_data method")
         if FAST_VERSION < 0:
             import warnings
             warnings.warn("C extension not loaded for Word2Vec, training will be slow. "
@@ -1750,6 +1752,25 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c
     def __str__(self):
         return "%s(vocab=%s, size=%s, alpha=%s)" % (self.__class__.__name__, len(self.index2word), self.vector_size, self.alpha)
 
+    def _minimize_model(self, save_syn1=False, save_syn1neg=False, save_syn0_lockf=False):
+        """Delete `syn1`, `syn1neg` and `syn0_lockf` unless the matching `save_*` flag is set, then flag the model as trimmed."""
+        keep_flags = (('syn1', save_syn1), ('syn1neg', save_syn1neg), ('syn0_lockf', save_syn0_lockf))
+        for attr_name, keep in keep_flags:
+            if hasattr(self, attr_name) and not keep:
+                delattr(self, attr_name)
+        # train() checks this flag and refuses to run once it is set
+        self.model_trimmed_post_training = True
+
+    def delete_temporary_training_data(self, replace=False):
+        """
+        Discard parameters that are used in training and score. Use if you're sure you're done training a model:
+        afterwards `train()` raises RuntimeError.
+        If `replace` is set, forget the original vectors and only keep the normalized ones = saves lots of memory!
+        """
+        if replace:
+            self.init_sims(replace=True)
+        self._minimize_model()
+
     def save(self, *args, **kwargs):
         # don't bother storing the cached normalized vectors, recalculable table
         kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'table', 'cum_table'])

diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
@@ -280,6 +280,30 @@ def models_equal(self, model, model2):
         self.assertEqual(len(model.docvecs.offset2doctag), len(model2.docvecs.offset2doctag))
         self.assertTrue(np.allclose(model.docvecs.doctag_syn0, model2.docvecs.doctag_syn0))
 
+    def test_delete_temporary_training_data(self):
+        """Test doc2vec model after delete_temporary_training_data"""
+        # every hs/negative 0/1 combination except hs=0 with negative=0
+        for hs, negative in [(0, 1), (1, 0), (1, 1)]:
+            model = doc2vec.Doc2Vec(sentences, size=5, min_count=1, window=4, hs=hs, negative=negative)
+            model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=False)
+            # assertTrue(x, y) treats y as the failure message and never compares,
+            # so check the vector length (size=5 above) with assertEqual instead.
+            self.assertEqual(len(model['human']), 5)
+            self.assertTrue(model.vocab['graph'].count > 0)
+            self.assertFalse(hasattr(model, 'syn1'))
+            self.assertFalse(hasattr(model, 'syn1neg'))
+            self.assertFalse(hasattr(model, 'syn0_lockf'))
+            self.assertTrue(model.docvecs and not hasattr(model.docvecs, 'doctag_syn0'))
+            self.assertTrue(model.docvecs and not hasattr(model.docvecs, 'doctag_syn0_lockf'))
+        model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=1, negative=0, alpha=0.05, min_count=2, iter=20)
+        model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
+        self.assertTrue(model.docvecs and hasattr(model.docvecs, 'doctag_syn0'))
+        self.assertTrue(hasattr(model, 'syn1'))
+        self.model_sanity(model)
+        model = doc2vec.Doc2Vec(list_corpus, dm=1, dm_mean=1, size=24, window=4, hs=0, negative=1, alpha=0.05, min_count=2, iter=20)
+        model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
+        self.assertTrue(hasattr(model, 'syn1neg'))
+
     @log_capture()
     def testBuildVocabWarning(self, l):
         """Test if logger warning is raised on non-ideal input to a doc2vec model"""

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
@@ -434,7 +434,7 @@ def testSimilarities(self):
         model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2)
         model.build_vocab(sentences)
         model.train(sentences)
-        
+
         self.assertTrue(model.n_similarity(['graph', 'trees'], ['trees', 'graph']))
         self.assertTrue(model.n_similarity(['graph'], ['trees']) == model.similarity('graph', 'trees'))
         self.assertRaises(ZeroDivisionError, model.n_similarity, ['graph', 'trees'], [])
@@ -482,6 +482,26 @@ def models_equal(self, model, model2):
         most_common_word = max(model.vocab.items(), key=lambda item: item[1].count)[0]
         self.assertTrue(numpy.allclose(model[most_common_word], model2[most_common_word]))
 
+    def testDeleteTemporaryTrainingData(self):
+        """Test word2vec model after delete_temporary_training_data (assertEqual, since assertTrue(x, y) only uses y as a message)"""
+        for hs in [0, 1]:
+            for negative in [0, 1]:
+                model = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=hs, negative=negative)
+                model.delete_temporary_training_data(replace=True)
+                self.assertEqual(len(model['human']), 10)
+                self.assertEqual(len(model.vocab), 12)
+                self.assertEqual(model.vocab['graph'].count, 3)
+                self.assertFalse(hasattr(model, 'syn1'))
+                self.assertFalse(hasattr(model, 'syn1neg'))
+                self.assertFalse(hasattr(model, 'syn0_lockf'))
+
+    def testNormalizeAfterTrainingData(self):
+        trained = word2vec.Word2Vec(sentences, min_count=1)
+        trained.save_word2vec_format(testfile(), binary=True)  # export in the binary word2vec format
+        reloaded = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True)
+        reloaded.delete_temporary_training_data(replace=True)  # replace=True also runs init_sims(replace=True)
+        self.assertFalse(numpy.allclose(trained['human'], reloaded['human']))  # vectors are expected to differ after trimming
+
     @log_capture()
     def testBuildVocabWarning(self, l):
         """Test if warning is raised on non-ideal input to a word2vec model"""