Record lifecycle events in Gensim models #3060

Merged · 12 commits · Mar 7, 2021
3 changes: 2 additions & 1 deletion in gensim/__init__.py
@@ -4,11 +4,12 @@

"""

__version__ = '4.0.0beta'

import logging

from gensim import parsing, corpora, matutils, interfaces, models, similarities, utils # noqa:F401

__version__ = '4.0.0beta'

logger = logging.getLogger('gensim')
if not logger.handlers: # To ensure reload() doesn't add another one
6 changes: 0 additions & 6 deletions in gensim/models/doc2vec.py
@@ -51,12 +51,6 @@
>>> model.save(fname)
>>> model = Doc2Vec.load(fname) # you can continue training with the loaded model!

If you're finished training a model (=no more updates, only querying, reduce memory usage), you can do:

.. sourcecode:: pycon

>>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

Infer vector for a new document:

.. sourcecode:: pycon
2 changes: 1 addition & 1 deletion in gensim/models/ldamodel.py
@@ -375,7 +375,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
Set to 0 for batch learning, > 1 for online iterative learning.
alpha : {numpy.ndarray, str}, optional
Can be set to an 1D array of length equal to the number of expected topics that expresses
our a-priori belief for the each topics' probability.
our a-priori belief for each topics' probability.
Alternatively default prior selecting strategies can be employed by supplying a string:

* 'symmetric': Default; uses a fixed symmetric prior per topic,
88 changes: 57 additions & 31 deletions in gensim/models/word2vec.py
@@ -65,6 +65,7 @@
.. sourcecode:: pycon

>>> vector = model.wv['computer'] # get numpy vector of a word
>>> sims = model.wv.most_similar('computer', topn=10) # get other similar words

The reason for separating the trained vectors into `KeyedVectors` is that if you don't
need the full model state any more (don't need to continue training), its state can be discarded,
@@ -432,6 +433,8 @@ def __init__(
"The callbacks provided in this initialization without triggering train will "
"be ignored.")

self.add_lifecycle_event("created", params=str(self))

def build_vocab(
self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000,
keep_raw_vocab=False, trim_rule=None, **kwargs,
@@ -470,7 +473,7 @@ def build_vocab(
* `min_count` (int) - the minimum count threshold.

**kwargs : object
Key word arguments propagated to `self.prepare_vocab`
Keyword arguments propagated to `self.prepare_vocab`

"""
self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=1)
@@ -481,6 +484,7 @@
report_values = self.prepare_vocab(update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs)
report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words'])
self.prepare_weights(update=update)
self.add_lifecycle_event("build_vocab", update=update, trim_rule=str(trim_rule))

def build_vocab_from_freq(
self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False,
@@ -610,13 +614,16 @@ def prepare_vocab(
calc_min_count = self.raw_vocab[sorted_vocab[self.max_final_vocab]] + 1

self.effective_min_count = max(calc_min_count, min_count)
logger.info(
"max_final_vocab=%d and min_count=%d resulted in calc_min_count=%d, effective_min_count=%d",
self.max_final_vocab, min_count, calc_min_count, self.effective_min_count
self.add_lifecycle_event(
"prepare_vocab",
msg=(
f"max_final_vocab={self.max_final_vocab} and min_count={min_count} resulted "
f"in calc_min_count={calc_min_count}, effective_min_count={self.effective_min_count}"
)
)

if not update:
logger.info("Loading a fresh vocabulary")
logger.info("Creating a fresh vocabulary")
retain_total, retain_words = 0, []
# Discard words less-frequent than min_count
if not dry_run:
@@ -642,15 +649,22 @@
self.wv.set_vecattr(word, 'count', self.raw_vocab[word])
original_unique_total = len(retain_words) + drop_unique
retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1)
logger.info(
"effective_min_count=%d retains %i unique words (%i%% of original %i, drops %i)",
self.effective_min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique
self.add_lifecycle_event(
"prepare_vocab",
msg=(
f"effective_min_count={self.effective_min_count} retains {len(retain_words)} unique "
f"words ({retain_unique_pct}%% of original {original_unique_total}, drops {drop_unique})"
),
)

original_total = retain_total + drop_total
retain_pct = retain_total * 100 / max(original_total, 1)
logger.info(
"effective_min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)",
self.effective_min_count, retain_total, retain_pct, original_total, drop_total
self.add_lifecycle_event(
"prepare_vocab",
msg=(
f"effective_min_count={self.effective_min_count} leaves {retain_total} word corpus "
f"({retain_pct}%% of original {original_total}, drops {drop_total})"
),
)
else:
logger.info("Updating model with new vocabulary")
@@ -681,11 +695,13 @@ def prepare_vocab(
original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique
pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1)
new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1)
logger.info(
"New added %i unique words (%i%% of original %i) "
"and increased the count of %i pre-existing words (%i%% of original %i)",
len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words),
pre_exist_unique_pct, original_unique_total
self.add_lifecycle_event(
"prepare_vocab",
msg=(
f"New added {len(new_words)} unique words ({new_unique_pct}%% of original "
f"{original_unique_total}) and increased the count of {len(pre_exist_words)} "
f"pre-existing words ({pre_exist_unique_pct}%% of original {original_unique_total})"
),
)
retain_words = new_words + pre_exist_words
retain_total = new_total + pre_exist_total
@@ -719,9 +735,12 @@ def prepare_vocab(
self.raw_vocab = defaultdict(int)

logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique)
logger.info(
"downsampling leaves estimated %i word corpus (%.1f%% of prior %i)",
downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total
self.add_lifecycle_event(
"prepare_vocab",
msg=(
f"downsampling leaves estimated {downsample_total} word corpus "
f"({downsample_total * 100.0 / max(retain_total, 1):.1f}%% of prior {retain_total})"
),
)

# return from each step: words-affected, resulting-corpus-size, extra memory estimates
@@ -774,7 +793,7 @@ def estimate_memory(self, vocab_size=None, report=None):
report['total'] = sum(report.values())
logger.info(
"estimated required memory for %i words and %i dimensions: %i bytes",
vocab_size, self.vector_size, report['total']
vocab_size, self.vector_size, report['total'],
)
return report

@@ -889,11 +908,15 @@ def _do_train_epoch(
work, neu1 = thread_private_mem

if self.sg:
examples, tally, raw_tally = train_epoch_sg(self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1, self.compute_loss)
examples, tally, raw_tally = train_epoch_sg(
self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1, self.compute_loss,
)
else:
examples, tally, raw_tally = train_epoch_cbow(self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1, self.compute_loss)
examples, tally, raw_tally = train_epoch_cbow(
self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1, self.compute_loss,
)

return examples, tally, raw_tally

@@ -1011,11 +1034,13 @@ def train(
self._check_training_sanity(epochs=epochs, total_examples=total_examples, total_words=total_words)
self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=epochs)

logger.info(
"training model with %i workers on %i vocabulary and %i features, "
"using sg=%s hs=%s sample=%s negative=%s window=%s",
self.workers, len(self.wv), self.layer1_size, self.sg,
self.hs, self.sample, self.negative, self.window
self.add_lifecycle_event(
"train",
msg=(
f"training model with {self.workers} workers on {len(self.wv)} vocabulary and "
f"{self.layer1_size} features, using sg={self.sg} hs={self.hs} sample={self.sample} "
f"negative={self.negative} window={self.window}"
),
)

self.compute_loss = compute_loss
@@ -1403,8 +1428,9 @@ def _train_epoch(
thread.start()

trained_word_count, raw_word_count, job_tally = self._log_epoch_progress(
progress_queue, job_queue, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words,
report_delay=report_delay, is_corpus_file_mode=False)
progress_queue, job_queue, cur_epoch=cur_epoch, total_examples=total_examples,
total_words=total_words, report_delay=report_delay, is_corpus_file_mode=False,
)

return trained_word_count, raw_word_count, job_tally

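With the changes above, every Word2Vec model now carries its own audit trail in a `lifecycle_events` list. A minimal sketch of inspecting it (illustrative only: `common_texts` is the tiny toy corpus bundled with gensim's test utilities, the hyperparameters are arbitrary, and the exact number and order of recorded events depends on how the model is constructed):

.. sourcecode:: pycon

>>> from gensim.models import Word2Vec
>>> from gensim.test.utils import common_texts
>>>
>>> model = Word2Vec(common_texts, vector_size=10, min_count=1, epochs=1)
>>> for event in model.lifecycle_events:  # populated by __init__ / build_vocab / prepare_vocab / train above
...     print(event["event"], event["datetime"])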
57 changes: 56 additions & 1 deletion in gensim/utils.py
@@ -27,11 +27,15 @@
import subprocess
import inspect
import heapq
from copy import deepcopy
from datetime import datetime
import platform

import numpy as np
import scipy.sparse
from smart_open import open

from gensim import __version__ as gensim_version

logger = logging.getLogger(__name__)

@@ -378,14 +382,63 @@ def call_on_class_only(*args, **kwargs):


class SaveLoad:
"""Serialize/deserialize object from disk, by equipping objects with the save()/load() methods.
"""Serialize/deserialize objects from disk, by equipping them with the `save()` / `load()` methods.

Warnings
--------
This uses pickle internally (among other techniques), so objects must not contain unpicklable attributes
such as lambda functions etc.

"""
def add_lifecycle_event(self, event_name, log_level=logging.INFO, **event):
"""
Append an event into the `lifecycle_events` attribute of this object.
Events are important moments during the object's life, such as "model created",
"model saved", "model loaded", etc.

The `lifecycle_events` attribute is persisted across the object's save() / load() operations.
It has no impact on the use of the model, but is useful during debugging and support.

Set `self.lifecycle_events = None` to disable this behaviour: calls to `add_lifecycle_event()`
then do nothing.

Parameters
----------
event_name : str
Name of the event. Can be any label, e.g. "created", "stored" etc.
event : dict
Key-value mapping to append to `self.lifecycle_events`. Should be JSON-serializable, so keep it simple.
Can be empty.

This method will automatically add the following key-values to `event`, so you don't have to specify them:

- `datetime`: the current time
- `gensim`: the current Gensim version
- `python`: the current Python version
- `platform`: the current platform
- `event`: the name of this event
log_level : int
Also log the complete event dict at the specified log level. Set to False to not log at all.
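
Examples
--------
A purely illustrative sketch (the "note" key and its value are made up; the remaining keys are
added automatically by this method):

.. sourcecode:: pycon

>>> from gensim.utils import SaveLoad
>>>
>>> obj = SaveLoad()  # any SaveLoad subclass (Word2Vec, LdaModel, ...) works the same way
>>> obj.add_lifecycle_event("created", note="illustrative key-value pair")
>>> sorted(obj.lifecycle_events[-1])
['datetime', 'event', 'gensim', 'note', 'platform', 'python']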

"""
# See also https://github.com/RaRe-Technologies/gensim/issues/2863
event_dict = deepcopy(event)
event_dict['datetime'] = datetime.now().isoformat()
event_dict['gensim'] = gensim_version
event_dict['python'] = sys.version
event_dict['platform'] = platform.platform()
event_dict['event'] = event_name

if not hasattr(self, 'lifecycle_events'):
logger.info("starting a new internal lifecycle event log for %s", self.__class__.__name__)
self.lifecycle_events = []

if log_level:
logger.log(log_level, "recording lifecycle event %s", event_dict)

if self.lifecycle_events is not None:
self.lifecycle_events.append(event_dict)

@classmethod
def load(cls, fname, mmap=None):
"""Load an object previously saved using :meth:`~gensim.utils.SaveLoad.save` from a file.
@@ -422,6 +475,7 @@ def load(cls, fname, mmap=None):
obj = unpickle(fname)
obj._load_specials(fname, mmap, compress, subname)
logger.info("loaded %s", fname)
obj.add_lifecycle_event("loaded")
return obj

def _load_specials(self, fname, mmap, compress, subname):
@@ -684,6 +738,7 @@ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=
Load object from file.

"""
self.add_lifecycle_event("saved")
try:
_pickle.dump(self, fname_or_handle, protocol=pickle_protocol)
logger.info("saved %s object", self.__class__.__name__)
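Together with the `load()` change above, the recorded events survive a save/load round trip. A rough, hypothetical sketch (same toy corpus and arbitrary parameters as the earlier example; the file name is a placeholder from gensim's test utilities, and output lines are omitted because timestamps, versions and platform strings vary):

.. sourcecode:: pycon

>>> from gensim.models import Word2Vec
>>> from gensim.test.utils import common_texts, get_tmpfile
>>>
>>> model = Word2Vec(common_texts, vector_size=10, min_count=1, epochs=1)
>>> fname = get_tmpfile("lifecycle_demo.model")  # hypothetical file name
>>> model.save(fname)  # SaveLoad.save() appends a "saved" event before pickling
>>> reloaded = Word2Vec.load(fname)  # SaveLoad.load() appends a "loaded" event to the restored object
>>> [event["event"] for event in reloaded.lifecycle_events]  # ends with 'saved' followed by 'loaded'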