diff --git a/CHANGELOG.md b/CHANGELOG.md index 6996ed2477..25737c095f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Changes ## Unreleased - fix RuntimeError in export_phrases (change defaultdict to dict) (PR [#3041](https://github.com/RaRe-Technologies/gensim/pull/3041), [@thalishsajeed](https://github.com/thalishsajeed)) +- Record lifecycle events in Gensim models (PR [#3060](https://github.com/RaRe-Technologies/gensim/pull/3060), [@piskvorky](https://github.com/piskvorky)) ## 4.0.0beta, 2020-10-31 diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md index ec31c5964f..95121b30ae 100644 --- a/ISSUE_TEMPLATE.md +++ b/ISSUE_TEMPLATE.md @@ -15,6 +15,12 @@ What are you trying to achieve? What is the expected result? What are you seeing Include full tracebacks, logs and datasets if necessary. Please keep the examples minimal ("minimal reproducible example"). +If your problem is with a specific Gensim model (word2vec, lsimodel, doc2vec, fasttext, ldamodel etc), include the following: + +```python +print(my_model.lifecycle_events) +``` + #### Versions Please provide the output of: diff --git a/docs/src/conf.py b/docs/src/conf.py index ba4973d061..036d84ac0d 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -63,7 +63,7 @@ # The short X.Y version. version = '4.0.0beta' # The full version, including alpha/beta/rc tags. -release = '4.0.0beta' +release = '4.0.0rc1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/gensim/__init__.py b/gensim/__init__.py index d6d0cd5054..db325ebfbd 100644 --- a/gensim/__init__.py +++ b/gensim/__init__.py @@ -4,11 +4,12 @@ """ +__version__ = '4.0.0rc1' + import logging from gensim import parsing, corpora, matutils, interfaces, models, similarities, utils # noqa:F401 -__version__ = '4.0.0beta' logger = logging.getLogger('gensim') if not logger.handlers: # To ensure reload() doesn't add another one diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index e046134250..6f40f252aa 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -77,6 +77,10 @@ def __init__(self, documents=None, prune_at=2000000): if documents is not None: self.add_documents(documents, prune_at=prune_at) + self.add_lifecycle_event( + "created", + msg=f"built {self} from {self.num_docs} documents (total {self.num_pos} corpus positions)", + ) def __getitem__(self, tokenid): """Get the string token that corresponds to `tokenid`. diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 51fdfdce43..0dcad7165a 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -51,12 +51,6 @@ >>> model.save(fname) >>> model = Doc2Vec.load(fname) # you can continue training with the loaded model! -If you're finished training a model (=no more updates, only querying, reduce memory usage), you can do: - -.. sourcecode:: pycon - - >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True) - Infer vector for a new document: .. sourcecode:: pycon diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 38dd4172f6..b52a77bcc4 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -653,7 +653,7 @@ def _pad_ones(m, new_len): def load_facebook_model(path, encoding='utf-8'): - """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` output file. + """Load the model from Facebook's native fasttext `.bin` output file. 
Notes ------ @@ -835,7 +835,10 @@ def _load_fasttext_format(model_file, encoding='utf-8', full_model=True): _check_model(model) - logger.info("loaded %s weight matrix for fastText model from %s", m.vectors_ngrams.shape, fin.name) + model.add_lifecycle_event( + "load_fasttext_format", + msg=f"loaded {m.vectors_ngrams.shape} weight matrix for fastText model from {fin.name}", + ) return model diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 193c5f8f0f..2251a65efb 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -191,8 +191,9 @@ class KeyedVectors(utils.SaveLoad): + def __init__(self, vector_size, count=0, dtype=np.float32, mapfile_path=None): - """Mapping between keys (such as words) and vectors for :class:`~gensim.models.Word2Vec` + """Mapping between keys (such as words) and vectors for :class:`~gensim.models.Word2Vec` and related models. Used to perform operations on the vectors such as vector lookup, distance, similarity etc. @@ -215,7 +216,7 @@ def __init__(self, vector_size, count=0, dtype=np.float32, mapfile_path=None): Vector dimensions will default to `np.float32` (AKA `REAL` in some Gensim code) unless another type is provided here. mapfile_path : string, optional - FIXME: UNDER CONSTRUCTION / WILL CHANGE PRE-4.0.0 PER #2955 / #2975. + Currently unused. """ self.vector_size = vector_size # pre-allocating `index_to_key` to full size helps avoid redundant re-allocations, esp for `expandos` @@ -259,7 +260,7 @@ def _load_specials(self, *args, **kwargs): self._upconvert_old_vocab() def _upconvert_old_vocab(self): - """Convert a loaded, pre-gensim-4.0.0 version instance that had a 'vocab' dict of data objects""" + """Convert a loaded, pre-gensim-4.0.0 version instance that had a 'vocab' dict of data objects.""" old_vocab = self.__dict__.pop('vocab', None) self.key_to_index = {} for k in old_vocab.keys(): @@ -277,6 +278,7 @@ def allocate_vecattrs(self, attrs=None, types=None): The length of the index_to_key list is canonical 'intended size' of KeyedVectors, even if other properties (vectors array) hasn't yet been allocated or expanded. So this allocation targets that size. + """ # with no arguments, adjust lengths of existing vecattr arrays to match length of index_to_key if attrs is None: @@ -351,13 +353,8 @@ def get_vecattr(self, key, attr): def resize_vectors(self, seed=0): """Make underlying vectors match index_to_key size; random-initialize any new rows.""" - target_shape = (len(self.index_to_key), self.vector_size) self.vectors = prep_vectors(target_shape, prior_vectors=self.vectors, seed=seed) - # FIXME BEFORE 4.0.0 PER #2955 / #2975 : support memmap & cleanup -# if hasattr(self, 'mapfile_path') and self.mapfile_path: -# self.vectors = np.memmap(self.mapfile_path, shape=(target_count, self.vector_size), mode='w+', dtype=REAL) - self.allocate_vecattrs() self.norms = None @@ -370,7 +367,7 @@ def __getitem__(self, key_or_keys): Parameters ---------- key_or_keys : {str, list of str, int, list of int} - Requested key or list-of-keys + Requested key or list-of-keys. Returns ------- @@ -784,7 +781,7 @@ def most_similar( return result[:topn] def similar_by_word(self, word, topn=10, restrict_vocab=None): - """Compatibility alias for similar_by_key()""" + """Compatibility alias for similar_by_key().""" return self.similar_by_key(word, topn, restrict_vocab) def similar_by_key(self, key, topn=10, restrict_vocab=None): @@ -1193,14 +1190,19 @@ def _log_evaluate_word_analogies(section): Returns ------- float - Accuracy score. 
+ Accuracy score if at least one prediction was made (correct or incorrect). + + Or return 0.0 if there were no predictions at all in this section. """ correct, incorrect = len(section['correct']), len(section['incorrect']) - if correct + incorrect > 0: - score = correct / (correct + incorrect) - logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect) - return score + + if correct + incorrect == 0: + return 0.0 + + score = correct / (correct + incorrect) + logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect) + return score def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): """Compute performance of the model on an analogy test set. @@ -1324,7 +1326,7 @@ def log_accuracy(section): if correct + incorrect > 0: logger.info( "%s: %.1f%% (%i/%i)", - section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect + section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect, ) @staticmethod @@ -1463,7 +1465,7 @@ def init_sims(self, replace=False): def unit_normalize_all(self): """Destructively scale all vectors to unit-length. - (You cannot sensibly continue training after such a step.) + You cannot sensibly continue training after such a step. """ self.fill_norms() @@ -1495,7 +1497,8 @@ def relative_cosine_similarity(self, wa, wb, topn=10): """ sims = self.similar_by_word(wa, topn) - assert sims, "Failed code invariant: list of similar words must never be empty." + if not sims: + raise ValueError("Cannot calculate relative cosine similarity without any similar words.") rcs = float(self.similarity(wa, wb)) / (sum(sim for _, sim in sims)) return rcs @@ -1583,7 +1586,7 @@ def load_word2vec_format( cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', limit=None, datatype=REAL, no_header=False, ): - """Load the input-hidden weight matrix from the original C word2vec-tool format. + """Load KeyedVectors from a file produced by the original C word2vec-tool format. Warnings -------- @@ -1660,7 +1663,7 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format if not vector_size == self.vector_size: raise ValueError("incompatible vector size %d in file %s" % (vector_size, fname)) - # TOCONSIDER: maybe mismatched vectors still useful enough to merge (truncating/padding)? + # TODO: maybe mismatched vectors still useful enough to merge (truncating/padding)? 
if binary: binary_len = dtype(REAL).itemsize * vector_size for _ in range(vocab_size): @@ -1688,7 +1691,10 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut overlap_count += 1 self.vectors[self.get_index(word)] = weights self.vectors_lockf[self.get_index(word)] = lockf # lock-factor: 0.0=no changes - logger.info("merged %d vectors into %s matrix from %s", overlap_count, self.wv.vectors.shape, fname) + self.add_lifecycle_event( + "intersect_word2vec_format", + msg=f"merged {overlap_count} vectors into {self.vectors.shape} matrix from {fname}", + ) def _upconvert_old_d2vkv(self): """Convert a deserialized older Doc2VecKeyedVectors instance to latest generic KeyedVectors""" @@ -1721,6 +1727,7 @@ def similarity_unseen_docs(self, *args, **kwargs): class CompatVocab: + def __init__(self, **kwargs): """A single vocabulary item, used internally for collecting per-word frequency/sampling info, and for constructing binary trees (incl. both word leaves and inner nodes). @@ -1847,7 +1854,7 @@ def _load_word2vec_format( fname : str The file path to the saved word2vec-format file. fvocab : str, optional - File path to the vocabulary.Word counts are read from `fvocab` filename, if set + File path to the vocabulary. Word counts are read from `fvocab` filename, if set (this is the file generated by `-save-vocab` flag of the original C tool). binary : bool, optional If True, indicates whether the data is in binary word2vec format. @@ -1913,7 +1920,11 @@ def _load_word2vec_format( kv.vectors = ascontiguousarray(kv.vectors[: len(kv)]) assert (len(kv), vector_size) == kv.vectors.shape - logger.info("loaded %s matrix from %s", kv.vectors.shape, fname) + kv.add_lifecycle_event( + "load_word2vec_format", + msg=f"loaded {kv.vectors.shape} matrix of type {kv.vectors.dtype} from {fname}", + binary=binary, encoding=encoding, + ) return kv @@ -1939,7 +1950,6 @@ def prep_vectors(target_shape, prior_vectors=None, seed=0, dtype=REAL): """Return a numpy array of the given shape. Reuse prior_vectors object or values to extent possible. Initialize new values randomly if requested. - FIXME: NAME/DOCS CHANGES PRE-4.0.0 FOR #2955/#2975 MMAP & OTHER INITIALIZATION CLEANUP WORK. """ if prior_vectors is None: prior_vectors = np.zeros((0, 0)) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 01a4029705..439aeb91fb 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -88,16 +88,17 @@ import logging import numbers import os +import time +from collections import defaultdict import numpy as np from scipy.special import gammaln, psi # gamma function utils from scipy.special import polygamma -from collections import defaultdict from gensim import interfaces, utils, matutils from gensim.matutils import ( kullback_leibler, hellinger, jaccard_distance, jensen_shannon, - dirichlet_expectation, logsumexp, mean_absolute_difference + dirichlet_expectation, logsumexp, mean_absolute_difference, ) from gensim.models import basemodel, CoherenceModel from gensim.models.callbacks import Callback @@ -375,7 +376,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, Set to 0 for batch learning, > 1 for online iterative learning. alpha : {numpy.ndarray, str}, optional Can be set to an 1D array of length equal to the number of expected topics that expresses - our a-priori belief for the each topics' probability. + our a-priori belief for each topics' probability. 
Alternatively default prior selecting strategies can be employed by supplying a string: * 'symmetric': Default; uses a fixed symmetric prior per topic, @@ -518,7 +519,12 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, # if a training corpus was provided, start estimating the model right away if corpus is not None: use_numpy = self.dispatcher is not None + start = time.time() self.update(corpus, chunks_as_numpy=use_numpy) + self.add_lifecycle_event( + "created", + msg=f"trained {self} in {time.time() - start:.2f}s", + ) def init_dir_prior(self, prior, name): """Initialize priors for the Dirichlet distribution. diff --git a/gensim/models/ldamulticore.py b/gensim/models/ldamulticore.py index a86daf1831..b65fcdd240 100644 --- a/gensim/models/ldamulticore.py +++ b/gensim/models/ldamulticore.py @@ -181,7 +181,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None, id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta, decay=decay, offset=offset, eval_every=eval_every, iterations=iterations, gamma_threshold=gamma_threshold, random_state=random_state, minimum_probability=minimum_probability, - minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics, dtype=dtype + minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics, dtype=dtype, ) def update(self, corpus, chunks_as_numpy=False): diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index d40690fd70..39a82bfc20 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -61,6 +61,7 @@ import logging import sys +import time import numpy as np import scipy.linalg @@ -351,17 +352,17 @@ class LsiModel(interfaces.TransformationABC, basemodel.BaseTopicModel): """ - def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000, - decay=1.0, distributed=False, onepass=True, - power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS, dtype=np.float64): - """Construct an `LsiModel` object. - - Either `corpus` or `id2word` must be supplied in order to train the model. + def __init__( + self, corpus=None, num_topics=200, id2word=None, chunksize=20000, + decay=1.0, distributed=False, onepass=True, + power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS, dtype=np.float64 + ): + """Build an LSI model. Parameters ---------- corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional - Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`). + Stream of document vectors or a sparse matrix of shape (`num_documents`, `num_terms`). num_topics : int, optional Number of requested factors (latent dimensions) id2word : dict of {int: str}, optional @@ -440,7 +441,12 @@ def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000, raise RuntimeError("failed to initialize distributed LSI (%s)" % err) if corpus is not None: + start = time.time() self.add_documents(corpus) + self.add_lifecycle_event( + "created", + msg=f"trained {self} in {time.time() - start:.2f}s", + ) def add_documents(self, corpus, chunksize=None, decay=None): """Update model with new `corpus`. 
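Note: the hunks above give `Dictionary`, `LdaModel` and `LsiModel` a `"created"` lifecycle event (for the two models, the event message includes wall-clock training time). A minimal sketch of how those events surface to a user — the toy corpus and `num_topics` below are illustrative, not part of this patch:

```python
from gensim.corpora import Dictionary
from gensim.models import LsiModel

# Tiny illustrative corpus, just to exercise the new code path.
texts = [
    ["human", "computer", "interaction"],
    ["survey", "user", "computer", "system"],
]
dictionary = Dictionary(texts)  # the Dictionary itself records a "created" event
corpus = [dictionary.doc2bow(text) for text in texts]

model = LsiModel(corpus=corpus, id2word=dictionary, num_topics=2)

# Each entry is a plain dict: event name, message, datetime, gensim/python/platform versions.
for event in model.lifecycle_events:
    print(event)
```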
diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 410fb9dced..8e20333b8f 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -66,6 +66,7 @@ from math import log import pickle from inspect import getfullargspec as getargspec +import time from gensim import utils, interfaces @@ -566,7 +567,9 @@ def __init__( raise pickle.PickleError(f'Custom scoring function in {self.__class__.__name__} must be pickle-able') if sentences is not None: + start = time.time() self.add_vocab(sentences) + self.add_lifecycle_event("created", msg=f"built {self} in {time.time() - start:.2f}s") def __str__(self): return "%s<%i vocab, min_count=%s, threshold=%s, max_vocab_size=%s>" % ( @@ -772,8 +775,9 @@ def __init__(self, phrases_model): self.scoring = phrases_model.scoring self.connector_words = phrases_model.connector_words logger.info('exporting phrases from %s', phrases_model) + start = time.time() self.phrasegrams = phrases_model.export_phrases() - logger.info('exported %s', self) + self.add_lifecycle_event("created", msg=f"exported {self} from {phrases_model} in {time.time() - start:.2f}s") def __str__(self): return "%s<%i phrases, min_count=%s, threshold=%s>" % ( diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index a8bf0044fa..1b2bf9fbb2 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -67,6 +67,7 @@ def __init__(self, corpus, id2word=None, num_topics=300): self.num_topics = num_topics if corpus is not None: self.initialize(corpus) + self.add_lifecycle_event("created", msg=f"created {self}") def __str__(self): return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 06ba520e90..4152f3eb3d 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -463,11 +463,14 @@ def initialize(self, corpus): self.dfs = dfs self.term_lengths = None # and finally compute the idf weights - logger.info( - "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)", - self.num_docs, max(dfs.keys()) + 1 if dfs else 0, self.num_nnz - ) self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) + self.add_lifecycle_event( + "initialize", + msg=( + f"calculated IDF weights for {self.num_docs} documents and {max(dfs.keys()) + 1 if dfs else 0}" + f" features ({self.num_nnz} matrix non-zeros)" + ), + ) def __getitem__(self, bow, eps=1e-12): """Get the tf-idf representation of an input vector and/or corpus. diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 87e15b7d60..77be91e5c6 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -65,6 +65,7 @@ .. sourcecode:: pycon >>> vector = model.wv['computer'] # get numpy vector of a word + >>> sims = model.wv.most_similar('computer', topn=10) # get other similar words The reason for separating the trained vectors into `KeyedVectors` is that if you don't need the full model state any more (don't need to continue training), its state can discarded, @@ -432,6 +433,8 @@ def __init__( "The callbacks provided in this initialization without triggering train will " "be ignored.") + self.add_lifecycle_event("created", params=str(self)) + def build_vocab( self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs, @@ -470,7 +473,7 @@ def build_vocab( * `min_count` (int) - the minimum count threshold. 
**kwargs : object - Key word arguments propagated to `self.prepare_vocab` + Keyword arguments propagated to `self.prepare_vocab`. """ self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=1) @@ -481,6 +484,7 @@ def build_vocab( report_values = self.prepare_vocab(update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) self.prepare_weights(update=update) + self.add_lifecycle_event("build_vocab", update=update, trim_rule=str(trim_rule)) def build_vocab_from_freq( self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False, @@ -610,13 +614,16 @@ def prepare_vocab( calc_min_count = self.raw_vocab[sorted_vocab[self.max_final_vocab]] + 1 self.effective_min_count = max(calc_min_count, min_count) - logger.info( - "max_final_vocab=%d and min_count=%d resulted in calc_min_count=%d, effective_min_count=%d", - self.max_final_vocab, min_count, calc_min_count, self.effective_min_count + self.add_lifecycle_event( + "prepare_vocab", + msg=( + f"max_final_vocab={self.max_final_vocab} and min_count={min_count} resulted " + f"in calc_min_count={calc_min_count}, effective_min_count={self.effective_min_count}" + ) ) if not update: - logger.info("Loading a fresh vocabulary") + logger.info("Creating a fresh vocabulary") retain_total, retain_words = 0, [] # Discard words less-frequent than min_count if not dry_run: @@ -642,15 +649,22 @@ def prepare_vocab( self.wv.set_vecattr(word, 'count', self.raw_vocab[word]) original_unique_total = len(retain_words) + drop_unique retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1) - logger.info( - "effective_min_count=%d retains %i unique words (%i%% of original %i, drops %i)", - self.effective_min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique + self.add_lifecycle_event( + "prepare_vocab", + msg=( + f"effective_min_count={self.effective_min_count} retains {len(retain_words)} unique " + f"words ({retain_unique_pct}%% of original {original_unique_total}, drops {drop_unique})" + ), ) + original_total = retain_total + drop_total retain_pct = retain_total * 100 / max(original_total, 1) - logger.info( - "effective_min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", - self.effective_min_count, retain_total, retain_pct, original_total, drop_total + self.add_lifecycle_event( + "prepare_vocab", + msg=( + f"effective_min_count={self.effective_min_count} leaves {retain_total} word corpus " + f"({retain_pct}%% of original {original_total}, drops {drop_total})" + ), ) else: logger.info("Updating model with new vocabulary") @@ -681,11 +695,13 @@ def prepare_vocab( original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1) new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1) - logger.info( - "New added %i unique words (%i%% of original %i) " - "and increased the count of %i pre-existing words (%i%% of original %i)", - len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words), - pre_exist_unique_pct, original_unique_total + self.add_lifecycle_event( + "prepare_vocab", + msg=( + f"added {len(new_words)} new unique words ({new_unique_pct}%% of original " + f"{original_unique_total}) and increased the count of {len(pre_exist_words)} " + f"pre-existing words ({pre_exist_unique_pct}%% of original 
{original_unique_total})" + ), ) retain_words = new_words + pre_exist_words retain_total = new_total + pre_exist_total @@ -719,9 +735,12 @@ def prepare_vocab( self.raw_vocab = defaultdict(int) logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique) - logger.info( - "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", - downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total + self.add_lifecycle_event( + "prepare_vocab", + msg=( + f"downsampling leaves estimated {downsample_total} word corpus " + f"({downsample_total * 100.0 / max(retain_total, 1):.1f}%% of prior {retain_total})" + ), ) # return from each step: words-affected, resulting-corpus-size, extra memory estimates @@ -774,7 +793,7 @@ def estimate_memory(self, vocab_size=None, report=None): report['total'] = sum(report.values()) logger.info( "estimated required memory for %i words and %i dimensions: %i bytes", - vocab_size, self.vector_size, report['total'] + vocab_size, self.vector_size, report['total'], ) return report @@ -889,11 +908,15 @@ def _do_train_epoch( work, neu1 = thread_private_mem if self.sg: - examples, tally, raw_tally = train_epoch_sg(self, corpus_file, offset, cython_vocab, cur_epoch, - total_examples, total_words, work, neu1, self.compute_loss) + examples, tally, raw_tally = train_epoch_sg( + self, corpus_file, offset, cython_vocab, cur_epoch, + total_examples, total_words, work, neu1, self.compute_loss, + ) else: - examples, tally, raw_tally = train_epoch_cbow(self, corpus_file, offset, cython_vocab, cur_epoch, - total_examples, total_words, work, neu1, self.compute_loss) + examples, tally, raw_tally = train_epoch_cbow( + self, corpus_file, offset, cython_vocab, cur_epoch, + total_examples, total_words, work, neu1, self.compute_loss, + ) return examples, tally, raw_tally @@ -1011,11 +1034,13 @@ def train( self._check_training_sanity(epochs=epochs, total_examples=total_examples, total_words=total_words) self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=epochs) - logger.info( - "training model with %i workers on %i vocabulary and %i features, " - "using sg=%s hs=%s sample=%s negative=%s window=%s", - self.workers, len(self.wv), self.layer1_size, self.sg, - self.hs, self.sample, self.negative, self.window + self.add_lifecycle_event( + "train", + msg=( + f"training model with {self.workers} workers on {len(self.wv)} vocabulary and " + f"{self.layer1_size} features, using sg={self.sg} hs={self.hs} sample={self.sample} " + f"negative={self.negative} window={self.window}" + ), ) self.compute_loss = compute_loss @@ -1059,6 +1084,7 @@ def train( for callback in callbacks: callback.on_train_end(self) + return trained_word_count, raw_word_count def _worker_loop_corpusfile( @@ -1403,8 +1429,9 @@ def _train_epoch( thread.start() trained_word_count, raw_word_count, job_tally = self._log_epoch_progress( - progress_queue, job_queue, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words, - report_delay=report_delay, is_corpus_file_mode=False) + progress_queue, job_queue, cur_epoch=cur_epoch, total_examples=total_examples, + total_words=total_words, report_delay=report_delay, is_corpus_file_mode=False, + ) return trained_word_count, raw_word_count, job_tally @@ -1602,7 +1629,7 @@ def _log_epoch_end( """ logger.info( "EPOCH - %i : training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - cur_epoch + 1, raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed + 
cur_epoch + 1, raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed, ) # don't warn if training in file-based mode, because it's expected behavior @@ -1636,10 +1663,10 @@ def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_ Total number of jobs processed during training. """ - logger.info( - "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - raw_word_count, trained_word_count, total_elapsed, trained_word_count / total_elapsed - ) + self.add_lifecycle_event("train", msg=( + f"training on {raw_word_count} raw words ({trained_word_count} effective words) " + f"took {total_elapsed:.1f}s, {trained_word_count / total_elapsed:.0f} effective words/s" + )) def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor=2, report_delay=1): """Score the log probability for a sequence of sentences. diff --git a/gensim/utils.py b/gensim/utils.py index fd524d2194..cf5d6b6499 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -27,11 +27,15 @@ import subprocess import inspect import heapq +from copy import deepcopy +from datetime import datetime +import platform import numpy as np import scipy.sparse from smart_open import open +from gensim import __version__ as gensim_version logger = logging.getLogger(__name__) @@ -378,7 +382,7 @@ def call_on_class_only(*args, **kwargs): class SaveLoad: - """Serialize/deserialize object from disk, by equipping objects with the save()/load() methods. + """Serialize/deserialize objects from disk, by equipping them with the `save()` / `load()` methods. Warnings -------- @@ -386,6 +390,60 @@ class SaveLoad: such as lambda functions etc. """ + def add_lifecycle_event(self, event_name, log_level=logging.INFO, **event): + """ + Append an event into the `lifecycle_events` attribute of this object, and also + optionally log the event at `log_level`. + + Events are important moments during the object's life, such as "model created", + "model saved", "model loaded", etc. + + The `lifecycle_events` attribute is persisted across object's :meth:`~gensim.utils.SaveLoad.save` + and :meth:`~gensim.utils.SaveLoad.load` operations. It has no impact on the use of the model, + but is useful during debugging and support. + + Set `self.lifecycle_events = None` to disable this behaviour. Calls to `add_lifecycle_event()` + will not record events into `self.lifecycle_events` then. + + Parameters + ---------- + event_name : str + Name of the event. Can be any label, e.g. "created", "stored" etc. + event : dict + Key-value mapping to append to `self.lifecycle_events`. Should be JSON-serializable, so keep it simple. + Can be empty. + + This method will automatically add the following key-values to `event`, so you don't have to specify them: + + - `datetime`: the current date & time + - `gensim`: the current Gensim version + - `python`: the current Python version + - `platform`: the current platform + - `event`: the name of this event + log_level : int + Also log the complete event dict, at the specified log level. Set to False to not log at all. + + """ + # See also https://github.com/RaRe-Technologies/gensim/issues/2863 + + event_dict = deepcopy(event) + event_dict['datetime'] = datetime.now().isoformat() + event_dict['gensim'] = gensim_version + event_dict['python'] = sys.version + event_dict['platform'] = platform.platform() + event_dict['event'] = event_name + + if not hasattr(self, 'lifecycle_events'): + # Avoid calling str(self), the object may not be fully initialized yet at this point. 
+ logger.debug("starting a new internal lifecycle event log for %s", self.__class__.__name__) + self.lifecycle_events = [] + + if log_level: + logger.log(log_level, "%s lifecycle event %s", self.__class__.__name__, event_dict) + + if self.lifecycle_events is not None: + self.lifecycle_events.append(event_dict) + @classmethod def load(cls, fname, mmap=None): """Load an object previously saved using :meth:`~gensim.utils.SaveLoad.save` from a file. @@ -421,7 +479,7 @@ def load(cls, fname, mmap=None): obj = unpickle(fname) obj._load_specials(fname, mmap, compress, subname) - logger.info("loaded %s", fname) + obj.add_lifecycle_event("loaded", fname=fname) return obj def _load_specials(self, fname, mmap, compress, subname): @@ -535,8 +593,6 @@ def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=fro in separate files. The automatic check is not performed in this case. """ - logger.info("saving %s under %s, separately %s", self, fname, separately) - compress, subname = SaveLoad._adapt_by_suffix(fname) restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol, @@ -684,6 +740,13 @@ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore= Load object from file. """ + self.add_lifecycle_event( + "saving", + fname_or_handle=str(fname_or_handle), + separately=str(separately), + sep_limit=sep_limit, + ignore=ignore, + ) try: _pickle.dump(self, fname_or_handle, protocol=pickle_protocol) logger.info("saved %s object", self.__class__.__name__) diff --git a/release/bump_version.py b/release/bump_version.py index 083463acaf..12d09a2368 100644 --- a/release/bump_version.py +++ b/release/bump_version.py @@ -1,4 +1,13 @@ -"""Bumps the version of gensim in all the required places.""" +""" +Bump the version of Gensim in all the required places. + +Usage: python3 bump_version.py + +Example: + python3 bump_version.py "4.0.0beta" "4.0.0rc1" + +""" + import os.path import re import sys diff --git a/setup.py b/setup.py index 97e8a5523b..07fd74d2ae 100644 --- a/setup.py +++ b/setup.py @@ -335,7 +335,7 @@ def run(self): setup( name='gensim', - version='4.0.0beta', + version='4.0.0rc1', description='Python framework for fast Vector Space Modelling', long_description=LONG_DESCRIPTION,