add lifecycle events to remaining models
piskvorky committed Mar 4, 2021
1 parent 4adac6c commit 5e0e127
Showing 10 changed files with 52 additions and 24 deletions.
4 changes: 4 additions & 0 deletions gensim/corpora/dictionary.py
@@ -77,6 +77,10 @@ def __init__(self, documents=None, prune_at=2000000):
 
         if documents is not None:
             self.add_documents(documents, prune_at=prune_at)
+            self.add_lifecycle_event(
+                "created",
+                msg=f"built {self} from {self.num_docs} documents (total {self.num_pos} corpus positions)",
+            )
 
     def __getitem__(self, tokenid):
         """Get the string token that corresponds to `tokenid`.
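For context, a minimal sketch of how the new event surfaces to callers, assuming `add_lifecycle_event` appends a plain dict (the `msg` plus metadata such as timestamp and gensim version) to a `lifecycle_events` list on the object:

from gensim.corpora import Dictionary

docs = [["human", "computer", "interaction"], ["graph", "trees"]]  # toy corpus
dct = Dictionary(docs)

# the "created" event recorded in __init__ above is now attached to the object
for event in dct.lifecycle_events:
    print(event)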
7 changes: 5 additions & 2 deletions gensim/models/fasttext.py
@@ -653,7 +653,7 @@ def _pad_ones(m, new_len):
 
 
 def load_facebook_model(path, encoding='utf-8'):
-    """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` output file.
+    """Load the model from Facebook's native fasttext `.bin` output file.
     Notes
     ------
@@ -835,7 +835,10 @@ def _load_fasttext_format(model_file, encoding='utf-8', full_model=True):
 
     _check_model(model)
 
-    logger.info("loaded %s weight matrix for fastText model from %s", m.vectors_ngrams.shape, fin.name)
+    model.add_lifecycle_event(
+        "load_fasttext_format",
+        msg=f"loaded {m.vectors_ngrams.shape} weight matrix for fastText model from {fin.name}",
+    )
     return model
 
 
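A usage sketch for the new event; the `.bin` path below is a placeholder for a real native fastText binary:

from gensim.models.fasttext import load_facebook_model

model = load_facebook_model("cc.en.300.bin")  # placeholder path
# the "load_fasttext_format" record, whose msg includes the ngram-matrix
# shape, is now part of the model's recorded history:
for event in model.lifecycle_events:
    print(event)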
4 changes: 1 addition & 3 deletions gensim/models/keyedvectors.py
@@ -1204,7 +1204,6 @@ def _log_evaluate_word_analogies(section):
         logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect)
         return score
 
-
     def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
         """Compute performance of the model on an analogy test set.
@@ -1923,7 +1922,7 @@ def _load_word2vec_format(
 
     kv.add_lifecycle_event(
         "load_word2vec_format",
-        msg=f"loaded {kv.vectors.shape} matrix of type {kv.dtype} from {fname}",
+        msg=f"loaded {kv.vectors.shape} matrix of type {kv.vectors.dtype} from {fname}",
         binary=binary, encoding=encoding,
     )
     return kv
@@ -1951,7 +1950,6 @@ def prep_vectors(target_shape, prior_vectors=None, seed=0, dtype=REAL):
     """Return a numpy array of the given shape. Reuse prior_vectors object or values
     to extent possible. Initialize new values randomly if requested.
-    FIXME: NAME/DOCS CHANGES PRE-4.0.0 FOR #2955/#2975 MMAP & OTHER INITIALIZATION CLEANUP WORK.
     """
     if prior_vectors is None:
         prior_vectors = np.zeros((0, 0))
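The `msg` fix above matters because the dtype lives on the underlying numpy array, not on the `KeyedVectors` object itself; `kv.vectors.dtype` reads it from the actual data rather than a possibly stale attribute. A sketch with a placeholder path:

from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format("vectors.bin", binary=True)  # placeholder path
print(kv.vectors.dtype)         # dtype of the stored vectors, e.g. float32
print(kv.lifecycle_events[-1])  # the "load_word2vec_format" record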
10 changes: 8 additions & 2 deletions gensim/models/ldamodel.py
@@ -88,16 +88,17 @@
 import logging
 import numbers
 import os
+import time
+from collections import defaultdict
 
 import numpy as np
 from scipy.special import gammaln, psi  # gamma function utils
 from scipy.special import polygamma
-from collections import defaultdict
 
 from gensim import interfaces, utils, matutils
 from gensim.matutils import (
     kullback_leibler, hellinger, jaccard_distance, jensen_shannon,
-    dirichlet_expectation, logsumexp, mean_absolute_difference
+    dirichlet_expectation, logsumexp, mean_absolute_difference,
 )
 from gensim.models import basemodel, CoherenceModel
 from gensim.models.callbacks import Callback
@@ -518,7 +519,12 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         # if a training corpus was provided, start estimating the model right away
         if corpus is not None:
             use_numpy = self.dispatcher is not None
+            start = time.time()
             self.update(corpus, chunks_as_numpy=use_numpy)
+            self.add_lifecycle_event(
+                "created",
+                msg=f"trained {self} in {time.time() - start:.2f}s",
+            )
 
     def init_dir_prior(self, prior, name):
         """Initialize priors for the Dirichlet distribution.
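A quick sketch of the resulting "created" event on a toy corpus (the timing in `msg` will of course vary), assuming the record keeps the `msg` kwarg under a "msg" key:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

docs = [["human", "computer", "interaction"], ["graph", "trees", "graph"]]
dct = Dictionary(docs)
corpus = [dct.doc2bow(doc) for doc in docs]

lda = LdaModel(corpus, num_topics=2, id2word=dct)
print(lda.lifecycle_events[-1]["msg"])  # e.g. "trained LdaModel(...) in 0.02s"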
2 changes: 1 addition & 1 deletion gensim/models/ldamulticore.py
@@ -181,7 +181,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
             id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta,
             decay=decay, offset=offset, eval_every=eval_every, iterations=iterations,
             gamma_threshold=gamma_threshold, random_state=random_state, minimum_probability=minimum_probability,
-            minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics, dtype=dtype
+            minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics, dtype=dtype,
         )
 
     def update(self, corpus, chunks_as_numpy=False):
20 changes: 13 additions & 7 deletions gensim/models/lsimodel.py
@@ -61,6 +61,7 @@
 
 import logging
 import sys
+import time
 
 import numpy as np
 import scipy.linalg
@@ -351,17 +352,17 @@ class LsiModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
     """
 
-    def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
-                 decay=1.0, distributed=False, onepass=True,
-                 power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS, dtype=np.float64):
-        """Construct an `LsiModel` object.
-        Either `corpus` or `id2word` must be supplied in order to train the model.
+    def __init__(
+            self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
+            decay=1.0, distributed=False, onepass=True,
+            power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS, dtype=np.float64
+    ):
+        """Build an LSI model.
         Parameters
         ----------
         corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional
-            Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`).
+            Stream of document vectors or a sparse matrix of shape (`num_documents`, `num_terms`).
         num_topics : int, optional
             Number of requested factors (latent dimensions)
         id2word : dict of {int: str}, optional
@@ -440,7 +441,12 @@ def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
             raise RuntimeError("failed to initialize distributed LSI (%s)" % err)
 
         if corpus is not None:
+            start = time.time()
             self.add_documents(corpus)
+            self.add_lifecycle_event(
+                "created",
+                msg=f"trained {self} in {time.time() - start:.2f}s",
+            )
 
     def add_documents(self, corpus, chunksize=None, decay=None):
         """Update model with new `corpus`.
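One motivation for recording an event instead of only logging: the record travels with the model. A sketch, assuming `lifecycle_events` is pickled by `save()` like any other attribute ("lsi.model" is a placeholder path):

from gensim.corpora import Dictionary
from gensim.models import LsiModel

docs = [["human", "computer", "interaction"], ["graph", "trees"]]
dct = Dictionary(docs)
corpus = [dct.doc2bow(doc) for doc in docs]

lsi = LsiModel(corpus, num_topics=2, id2word=dct)
lsi.save("lsi.model")
loaded = LsiModel.load("lsi.model")
print(loaded.lifecycle_events)  # the "created" record survives the round trip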
6 changes: 5 additions & 1 deletion gensim/models/phrases.py
@@ -66,6 +66,7 @@
 from math import log
 import pickle
 from inspect import getfullargspec as getargspec
+import time
 
 from gensim import utils, interfaces
 
@@ -566,7 +567,9 @@ def __init__(
             raise pickle.PickleError(f'Custom scoring function in {self.__class__.__name__} must be pickle-able')
 
         if sentences is not None:
+            start = time.time()
             self.add_vocab(sentences)
+            self.add_lifecycle_event("created", msg=f"built {self} in {time.time() - start:.2f}s")
 
     def __str__(self):
         return "%s<%i vocab, min_count=%s, threshold=%s, max_vocab_size=%s>" % (
@@ -772,8 +775,9 @@ def __init__(self, phrases_model):
         self.scoring = phrases_model.scoring
         self.connector_words = phrases_model.connector_words
         logger.info('exporting phrases from %s', phrases_model)
+        start = time.time()
         self.phrasegrams = phrases_model.export_phrases()
-        logger.info('exported %s', self)
+        self.add_lifecycle_event("created", msg=f"exported {self} from {phrases_model} in {time.time() - start:.2f}s")
 
     def __str__(self):
         return "%s<%i phrases, min_count=%s, threshold=%s>" % (
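Both the trainable `Phrases` model and the exported `FrozenPhrases` now record "created" events. A sketch, assuming `Phrases.freeze()` as the export entry point (it wraps the `FrozenPhrases` constructor changed above):

from gensim.models.phrases import Phrases

sentences = [["new", "york", "city"], ["new", "york", "times"]] * 10
phrases = Phrases(sentences, min_count=1, threshold=1)
frozen = phrases.freeze()  # goes through FrozenPhrases.__init__

print(phrases.lifecycle_events[-1])  # "built Phrases<...> in ...s"
print(frozen.lifecycle_events[-1])   # "exported FrozenPhrases<...> from Phrases<...> in ...s"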
1 change: 1 addition & 0 deletions gensim/models/rpmodel.py
@@ -67,6 +67,7 @@ def __init__(self, corpus, id2word=None, num_topics=300):
         self.num_topics = num_topics
         if corpus is not None:
             self.initialize(corpus)
+            self.add_lifecycle_event("created", msg=f"created {self}")
 
     def __str__(self):
         return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics)
11 changes: 7 additions & 4 deletions gensim/models/tfidfmodel.py
@@ -463,11 +463,14 @@ def initialize(self, corpus):
         self.dfs = dfs
         self.term_lengths = None
         # and finally compute the idf weights
-        logger.info(
-            "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)",
-            self.num_docs, max(dfs.keys()) + 1 if dfs else 0, self.num_nnz
-        )
         self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
+        self.add_lifecycle_event(
+            "initialize",
+            msg=(
+                f"calculated IDF weights for {self.num_docs} documents and {max(dfs.keys()) + 1 if dfs else 0}"
+                f" features ({self.num_nnz} matrix non-zeros)"
+            ),
+        )
 
     def __getitem__(self, bow, eps=1e-12):
         """Get the tf-idf representation of an input vector and/or corpus.
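Note the event name here is "initialize" rather than "created", matching the method that computes the IDF table. A toy sketch:

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

docs = [["cat", "sat", "mat"], ["dog", "sat", "log"]]
dct = Dictionary(docs)
corpus = [dct.doc2bow(doc) for doc in docs]

tfidf = TfidfModel(corpus)  # __init__ calls initialize(), which records the event
print(tfidf.lifecycle_events[-1]["msg"])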
11 changes: 7 additions & 4 deletions gensim/models/word2vec.py
@@ -192,6 +192,7 @@
 import itertools
 import copy
 from queue import Queue, Empty
+import time
 
 from numpy import float32 as REAL
 import numpy as np
@@ -1053,6 +1054,7 @@ def train(
         raw_word_count = 0
         start = default_timer() - 0.00001
         job_tally = 0
+        start = time.time()
 
         for cur_epoch in range(self.epochs):
             for callback in callbacks:
@@ -1084,6 +1086,7 @@
 
         for callback in callbacks:
             callback.on_train_end(self)
+
         return trained_word_count, raw_word_count
 
     def _worker_loop_corpusfile(
@@ -1662,10 +1665,10 @@ def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_tally):
             Total number of jobs processed during training.
         """
-        logger.info(
-            "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s",
-            raw_word_count, trained_word_count, total_elapsed, trained_word_count / total_elapsed
-        )
+        self.add_lifecycle_event("train", msg=(
+            f"training on {raw_word_count} raw words ({trained_word_count} effective words) "
+            f"took {total_elapsed:.1f}s, {trained_word_count / total_elapsed:.0f} effective words/s"
+        ))
 
     def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor=2, report_delay=1):
         """Score the log probability for a sequence of sentences.
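With this change, `_log_train_end` records a "train" event (which the lifecycle API also logs), so training statistics persist with the model. A sketch on a toy corpus, assuming each record keeps its name under an "event" key:

from gensim.models import Word2Vec

sentences = [["the", "quick", "brown", "fox"], ["jumps", "over", "the", "dog"]] * 50
model = Word2Vec(sentences, vector_size=10, min_count=1, epochs=2)

train_events = [e for e in model.lifecycle_events if e.get("event") == "train"]
print(train_events[-1]["msg"])  # raw/effective word counts, wall time, words/s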
