Record lifecycle events in Gensim models #3060

Merged · 12 commits · Mar 7, 2021
3 changes: 2 additions & 1 deletion in gensim/__init__.py
@@ -4,11 +4,12 @@

"""

__version__ = '4.0.0beta'

import logging

from gensim import parsing, corpora, matutils, interfaces, models, similarities, utils # noqa:F401

__version__ = '4.0.0beta'

logger = logging.getLogger('gensim')
if not logger.handlers: # To ensure reload() doesn't add another one
6 changes: 0 additions & 6 deletions in gensim/models/doc2vec.py
@@ -51,12 +51,6 @@
>>> model.save(fname)
>>> model = Doc2Vec.load(fname) # you can continue training with the loaded model!

If you're finished training a model (=no more updates, only querying, reduce memory usage), you can do:

.. sourcecode:: pycon

>>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

Infer vector for a new document:

.. sourcecode:: pycon
2 changes: 1 addition & 1 deletion in gensim/models/ldamodel.py
@@ -375,7 +375,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
Set to 0 for batch learning, > 1 for online iterative learning.
alpha : {numpy.ndarray, str}, optional
Can be set to an 1D array of length equal to the number of expected topics that expresses
our a-priori belief for the each topics' probability.
our a-priori belief for each topics' probability.
Alternatively default prior selecting strategies can be employed by supplying a string:

* 'symmetric': Default; uses a fixed symmetric prior per topic,
88 changes: 57 additions & 31 deletions in gensim/models/word2vec.py
@@ -65,6 +65,7 @@
.. sourcecode:: pycon

>>> vector = model.wv['computer'] # get numpy vector of a word
>>> sims = model.wv.most_similar('computer', topn=10) # get other similar words

The reason for separating the trained vectors into `KeyedVectors` is that if you don't
need the full model state any more (don't need to continue training), its state can be discarded,
@@ -432,6 +433,8 @@ def __init__(
"The callbacks provided in this initialization without triggering train will "
"be ignored.")

self.add_lifecycle_event("created", params=str(self))

def build_vocab(
self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000,
keep_raw_vocab=False, trim_rule=None, **kwargs,
@@ -470,7 +473,7 @@ def build_vocab(
* `min_count` (int) - the minimum count threshold.

**kwargs : object
Key word arguments propagated to `self.prepare_vocab`
Keyword arguments propagated to `self.prepare_vocab`

"""
self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=1)
@@ -481,6 +484,7 @@
report_values = self.prepare_vocab(update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs)
report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words'])
self.prepare_weights(update=update)
self.add_lifecycle_event("build_vocab", update=update, trim_rule=str(trim_rule))

def build_vocab_from_freq(
self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False,
@@ -610,13 +614,16 @@ def prepare_vocab(
calc_min_count = self.raw_vocab[sorted_vocab[self.max_final_vocab]] + 1

self.effective_min_count = max(calc_min_count, min_count)
logger.info(
"max_final_vocab=%d and min_count=%d resulted in calc_min_count=%d, effective_min_count=%d",
self.max_final_vocab, min_count, calc_min_count, self.effective_min_count
self.add_lifecycle_event(
"prepare_vocab",
msg=(
f"max_final_vocab={self.max_final_vocab} and min_count={min_count} resulted "
f"in calc_min_count={calc_min_count}, effective_min_count={self.effective_min_count}"
)
)

if not update:
logger.info("Loading a fresh vocabulary")
logger.info("Creating a fresh vocabulary")
retain_total, retain_words = 0, []
# Discard words less-frequent than min_count
if not dry_run:
@@ -642,15 +649,22 @@
self.wv.set_vecattr(word, 'count', self.raw_vocab[word])
original_unique_total = len(retain_words) + drop_unique
retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1)
logger.info(
"effective_min_count=%d retains %i unique words (%i%% of original %i, drops %i)",
self.effective_min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique
self.add_lifecycle_event(
"prepare_vocab",
msg=(
f"effective_min_count={self.effective_min_count} retains {len(retain_words)} unique "
f"words ({retain_unique_pct}%% of original {original_unique_total}, drops {drop_unique})"
),
)

original_total = retain_total + drop_total
retain_pct = retain_total * 100 / max(original_total, 1)
logger.info(
"effective_min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)",
self.effective_min_count, retain_total, retain_pct, original_total, drop_total
self.add_lifecycle_event(
"prepare_vocab",
msg=(
f"effective_min_count={self.effective_min_count} leaves {retain_total} word corpus "
f"({retain_pct}%% of original {original_total}, drops {drop_total})"
),
)
else:
logger.info("Updating model with new vocabulary")
@@ -681,11 +695,13 @@ def prepare_vocab(
original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique
pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1)
new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1)
logger.info(
"New added %i unique words (%i%% of original %i) "
"and increased the count of %i pre-existing words (%i%% of original %i)",
len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words),
pre_exist_unique_pct, original_unique_total
self.add_lifecycle_event(
"prepare_vocab",
msg=(
f"New added {len(new_words)} unique words ({new_unique_pct}%% of original "
f"{original_unique_total}) and increased the count of {len(pre_exist_words)} "
f"pre-existing words ({pre_exist_unique_pct}%% of original {original_unique_total})"
),
)
retain_words = new_words + pre_exist_words
retain_total = new_total + pre_exist_total
@@ -719,9 +735,12 @@ def prepare_vocab(
self.raw_vocab = defaultdict(int)

logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique)
logger.info(
"downsampling leaves estimated %i word corpus (%.1f%% of prior %i)",
downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total
self.add_lifecycle_event(
"prepare_vocab",
msg=(
f"downsampling leaves estimated {downsample_total} word corpus "
f"({downsample_total * 100.0 / max(retain_total, 1):.1f}%% of prior {retain_total})"
),
)

# return from each step: words-affected, resulting-corpus-size, extra memory estimates
@@ -774,7 +793,7 @@ def estimate_memory(self, vocab_size=None, report=None):
report['total'] = sum(report.values())
logger.info(
"estimated required memory for %i words and %i dimensions: %i bytes",
vocab_size, self.vector_size, report['total']
vocab_size, self.vector_size, report['total'],
)
return report

@@ -889,11 +908,15 @@ def _do_train_epoch(
work, neu1 = thread_private_mem

if self.sg:
examples, tally, raw_tally = train_epoch_sg(self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1, self.compute_loss)
examples, tally, raw_tally = train_epoch_sg(
self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1, self.compute_loss,
)
else:
examples, tally, raw_tally = train_epoch_cbow(self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1, self.compute_loss)
examples, tally, raw_tally = train_epoch_cbow(
self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1, self.compute_loss,
)

return examples, tally, raw_tally

@@ -1011,11 +1034,13 @@ def train(
self._check_training_sanity(epochs=epochs, total_examples=total_examples, total_words=total_words)
self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=epochs)

logger.info(
"training model with %i workers on %i vocabulary and %i features, "
"using sg=%s hs=%s sample=%s negative=%s window=%s",
self.workers, len(self.wv), self.layer1_size, self.sg,
self.hs, self.sample, self.negative, self.window
self.add_lifecycle_event(
"train",
msg=(
f"training model with {self.workers} workers on {len(self.wv)} vocabulary and "
f"{self.layer1_size} features, using sg={self.sg} hs={self.hs} sample={self.sample} "
f"negative={self.negative} window={self.window}"
),
)

self.compute_loss = compute_loss
@@ -1403,8 +1428,9 @@ def _train_epoch(
thread.start()

trained_word_count, raw_word_count, job_tally = self._log_epoch_progress(
progress_queue, job_queue, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words,
report_delay=report_delay, is_corpus_file_mode=False)
progress_queue, job_queue, cur_epoch=cur_epoch, total_examples=total_examples,
total_words=total_words, report_delay=report_delay, is_corpus_file_mode=False,
)

return trained_word_count, raw_word_count, job_tally

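With the changes above, every Word2Vec model now carries its own audit trail in a `lifecycle_events` list. A minimal sketch of inspecting it (illustrative only: `common_texts` is the tiny toy corpus bundled with gensim's test utilities, the hyperparameters are arbitrary, and the exact number and order of recorded events depends on how the model is constructed):

.. sourcecode:: pycon

>>> from gensim.models import Word2Vec
>>> from gensim.test.utils import common_texts
>>>
>>> model = Word2Vec(common_texts, vector_size=10, min_count=1, epochs=1)
>>> for event in model.lifecycle_events:  # populated by __init__ / build_vocab / prepare_vocab / train above
...     print(event["event"], event["datetime"])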
57 changes: 56 additions & 1 deletion in gensim/utils.py
@@ -27,11 +27,15 @@
import subprocess
import inspect
import heapq
from copy import deepcopy
from datetime import datetime
import platform

import numpy as np
import scipy.sparse
from smart_open import open

from gensim import __version__ as gensim_version

logger = logging.getLogger(__name__)

@@ -378,14 +382,63 @@ def call_on_class_only(*args, **kwargs):


class SaveLoad:
"""Serialize/deserialize object from disk, by equipping objects with the save()/load() methods.
"""Serialize/deserialize objects from disk, by equipping them with the `save()` / `load()` methods.

Warnings
--------
This uses pickle internally (among other techniques), so objects must not contain unpicklable attributes
such as lambda functions etc.

"""
def add_lifecycle_event(self, event_name, log_level=logging.INFO, **event):
"""
Append an event into the `lifecycle_events` attribute of this object.
Events are important moments during the object's life, such as "model created",
"model saved", "model loaded", etc.

The `lifecycle_events` attribute is persisted across the object's save() / load() operations.
It has no impact on the use of the model, but is useful during debugging and support.

Set `self.lifecycle_events = None` to disable this behaviour: calls to `add_lifecycle_event()`
then do nothing.

Parameters
----------
event_name : str
Name of the event. Can be any label, e.g. "created", "stored" etc.
event : dict
Key-value mapping to append to `self.lifecycle_events`. Should be JSON-serializable, so keep it simple.
Can be empty.

This method will automatically add the following key-values to `event`, so you don't have to specify them:

- `datetime`: the current time
- `gensim`: the current Gensim version
- `python`: the current Python version
- `platform`: the current platform
- `event`: the name of this event
log_level : int
Also log the complete event dict at the specified log level. Set to False to not log at all.
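
Examples
--------
A purely illustrative sketch (the "note" key and its value are made up; the remaining keys are
added automatically by this method):

.. sourcecode:: pycon

>>> from gensim.utils import SaveLoad
>>>
>>> obj = SaveLoad()  # any SaveLoad subclass (Word2Vec, LdaModel, ...) works the same way
>>> obj.add_lifecycle_event("created", note="illustrative key-value pair")
>>> sorted(obj.lifecycle_events[-1])
['datetime', 'event', 'gensim', 'note', 'platform', 'python']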

"""
# See also https://github.com/RaRe-Technologies/gensim/issues/2863
event_dict = deepcopy(event)
event_dict['datetime'] = datetime.now().isoformat()
event_dict['gensim'] = gensim_version
event_dict['python'] = sys.version
event_dict['platform'] = platform.platform()
event_dict['event'] = event_name

if not hasattr(self, 'lifecycle_events'):
logger.info("starting a new internal lifecycle event log for %s", self.__class__.__name__)
self.lifecycle_events = []

if log_level:
logger.log(log_level, "recording lifecycle event %s", event_dict)

if self.lifecycle_events is not None:
self.lifecycle_events.append(event_dict)

@classmethod
def load(cls, fname, mmap=None):
"""Load an object previously saved using :meth:`~gensim.utils.SaveLoad.save` from a file.
@@ -422,6 +475,7 @@ def load(cls, fname, mmap=None):
obj = unpickle(fname)
obj._load_specials(fname, mmap, compress, subname)
logger.info("loaded %s", fname)
obj.add_lifecycle_event("loaded")
return obj

def _load_specials(self, fname, mmap, compress, subname):
@@ -684,6 +738,7 @@ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=
Load object from file.

"""
self.add_lifecycle_event("saved")
try:
_pickle.dump(self, fname_or_handle, protocol=pickle_protocol)
logger.info("saved %s object", self.__class__.__name__)
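Together with the `load()` change above, the recorded events survive a save/load round trip. A rough, hypothetical sketch (same toy corpus and arbitrary parameters as the earlier example; the file name is a placeholder from gensim's test utilities, and output lines are omitted because timestamps, versions and platform strings vary):

.. sourcecode:: pycon

>>> from gensim.models import Word2Vec
>>> from gensim.test.utils import common_texts, get_tmpfile
>>>
>>> model = Word2Vec(common_texts, vector_size=10, min_count=1, epochs=1)
>>> fname = get_tmpfile("lifecycle_demo.model")  # hypothetical file name
>>> model.save(fname)  # SaveLoad.save() appends a "saved" event before pickling
>>> reloaded = Word2Vec.load(fname)  # SaveLoad.load() appends a "loaded" event to the restored object
>>> [event["event"] for event in reloaded.lifecycle_events]  # ends with 'saved' followed by 'loaded'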