From 4b559ccf9a00e9cea521d6328d859a85c51993a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= <radimrehurek@seznam.cz> Date: Wed, 3 Mar 2021 21:16:05 +0100 Subject: [PATCH 01/12] fix docs --- gensim/models/doc2vec.py | 6 ------ gensim/models/ldamodel.py | 2 +- gensim/models/word2vec.py | 1 + 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 51fdfdce43..0dcad7165a 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -51,12 +51,6 @@ >>> model.save(fname) >>> model = Doc2Vec.load(fname) # you can continue training with the loaded model! -If you're finished training a model (=no more updates, only querying, reduce memory usage), you can do: - -.. sourcecode:: pycon - - >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True) - Infer vector for a new document: .. sourcecode:: pycon diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 01a4029705..6ebc0e8b4f 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -375,7 +375,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, Set to 0 for batch learning, > 1 for online iterative learning. alpha : {numpy.ndarray, str}, optional Can be set to an 1D array of length equal to the number of expected topics that expresses - our a-priori belief for the each topics' probability. + our a-priori belief for each topics' probability. Alternatively default prior selecting strategies can be employed by supplying a string: * 'symmetric': Default; uses a fixed symmetric prior per topic, diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 87e15b7d60..24c4700369 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -65,6 +65,7 @@ .. sourcecode:: pycon >>> vector = model.wv['computer'] # get numpy vector of a word + >>> sims = model.wv.most_similar('computer', topn=10) # get other similar words The reason for separating the trained vectors into `KeyedVectors` is that if you don't need the full model state any more (don't need to continue training), its state can discarded, From 05052f200f4a07bdc773a5ffd47b5d85d55ac79a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= <radimrehurek@seznam.cz> Date: Thu, 4 Mar 2021 01:20:23 +0100 Subject: [PATCH 02/12] re #2863: record lifecycle events --- gensim/__init__.py | 3 +- gensim/models/word2vec.py | 76 ++++++++++++++++++++++++++------------- gensim/utils.py | 56 ++++++++++++++++++++++++++++- 3 files changed, 108 insertions(+), 27 deletions(-) diff --git a/gensim/__init__.py b/gensim/__init__.py index d6d0cd5054..f61aec9dcc 100644 --- a/gensim/__init__.py +++ b/gensim/__init__.py @@ -4,11 +4,12 @@ """ +__version__ = '4.0.0beta' + import logging from gensim import parsing, corpora, matutils, interfaces, models, similarities, utils # noqa:F401 -__version__ = '4.0.0beta' logger = logging.getLogger('gensim') if not logger.handlers: # To ensure reload() doesn't add another one diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 24c4700369..f888d36592 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -433,6 +433,8 @@ def __init__( "The callbacks provided in this initialization without triggering train will " "be ignored.") + self.add_lifecycle_event("created", params=str(self)) + def build_vocab( self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs, @@ -471,7 +473,7 @@ def build_vocab( * `min_count` (int) - the minimum count threshold. **kwargs : object - Key word arguments propagated to `self.prepare_vocab` + Keyword arguments propagated to `self.prepare_vocab` """ self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=1) @@ -482,6 +484,7 @@ def build_vocab( report_values = self.prepare_vocab(update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) self.prepare_weights(update=update) + self.add_lifecycle_event("build_vocab", update=update, trim_rule=str(trim_rule)) def build_vocab_from_freq( self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False, @@ -611,13 +614,16 @@ def prepare_vocab( calc_min_count = self.raw_vocab[sorted_vocab[self.max_final_vocab]] + 1 self.effective_min_count = max(calc_min_count, min_count) - logger.info( - "max_final_vocab=%d and min_count=%d resulted in calc_min_count=%d, effective_min_count=%d", - self.max_final_vocab, min_count, calc_min_count, self.effective_min_count + self.add_lifecycle_event( + "prepare_vocab", + msg=( + "max_final_vocab=%d and min_count=%d resulted in calc_min_count=%d, effective_min_count=%d" % + (self.max_final_vocab, min_count, calc_min_count, self.effective_min_count, ) + ) ) if not update: - logger.info("Loading a fresh vocabulary") + logger.info("Creating a fresh vocabulary") retain_total, retain_words = 0, [] # Discard words less-frequent than min_count if not dry_run: @@ -643,15 +649,22 @@ def prepare_vocab( self.wv.set_vecattr(word, 'count', self.raw_vocab[word]) original_unique_total = len(retain_words) + drop_unique retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1) - logger.info( - "effective_min_count=%d retains %i unique words (%i%% of original %i, drops %i)", - self.effective_min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique + self.add_lifecycle_event( + "prepare_vocab", + msg=( + "effective_min_count=%d retains %i unique words (%i%% of original %i, drops %i)" % + (self.effective_min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique,) + ), ) + original_total = retain_total + drop_total retain_pct = retain_total * 100 / max(original_total, 1) - logger.info( - "effective_min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", - self.effective_min_count, retain_total, retain_pct, original_total, drop_total + self.add_lifecycle_event( + "prepare_vocab", + msg=( + "effective_min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)" % + (self.effective_min_count, retain_total, retain_pct, original_total, drop_total, ) + ) ) else: logger.info("Updating model with new vocabulary") @@ -682,11 +695,16 @@ def prepare_vocab( original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1) new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1) - logger.info( - "New added %i unique words (%i%% of original %i) " - "and increased the count of %i pre-existing words (%i%% of original %i)", - len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words), - pre_exist_unique_pct, original_unique_total + self.add_lifecycle_event( + "prepare_vocab", + msg=( + "New added %i unique words (%i%% of original %i) " + "and increased the count of %i pre-existing words (%i%% of original %i)" % + ( + len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words), + pre_exist_unique_pct, original_unique_total, + ) + ), ) retain_words = new_words + pre_exist_words retain_total = new_total + pre_exist_total @@ -720,9 +738,12 @@ def prepare_vocab( self.raw_vocab = defaultdict(int) logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique) - logger.info( - "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", - downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total + self.add_lifecycle_event( + "prepare_vocab", + msg=( + "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)" % + (downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total, ) + ), ) # return from each step: words-affected, resulting-corpus-size, extra memory estimates @@ -775,7 +796,7 @@ def estimate_memory(self, vocab_size=None, report=None): report['total'] = sum(report.values()) logger.info( "estimated required memory for %i words and %i dimensions: %i bytes", - vocab_size, self.vector_size, report['total'] + vocab_size, self.vector_size, report['total'], ) return report @@ -1012,11 +1033,16 @@ def train( self._check_training_sanity(epochs=epochs, total_examples=total_examples, total_words=total_words) self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=epochs) - logger.info( - "training model with %i workers on %i vocabulary and %i features, " - "using sg=%s hs=%s sample=%s negative=%s window=%s", - self.workers, len(self.wv), self.layer1_size, self.sg, - self.hs, self.sample, self.negative, self.window + self.add_lifecycle_event( + "train", + msg=( + "training model with %i workers on %i vocabulary and %i features, " + "using sg=%s hs=%s sample=%s negative=%s window=%s" % + ( + self.workers, len(self.wv), self.layer1_size, self.sg, self.hs, + self.sample, self.negative, self.window, + ) + ), ) self.compute_loss = compute_loss diff --git a/gensim/utils.py b/gensim/utils.py index fd524d2194..2bce2a0c3a 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -27,11 +27,15 @@ import subprocess import inspect import heapq +from copy import deepcopy +from datetime import datetime +import platform import numpy as np import scipy.sparse from smart_open import open +from gensim import __version__ as gensim_version logger = logging.getLogger(__name__) @@ -378,7 +382,7 @@ def call_on_class_only(*args, **kwargs): class SaveLoad: - """Serialize/deserialize object from disk, by equipping objects with the save()/load() methods. + """Serialize/deserialize objects from disk, by equipping them with the `save()` / `load()` methods. Warnings -------- @@ -386,6 +390,54 @@ class SaveLoad: such as lambda functions etc. """ + def add_lifecycle_event(self, event_name, log_level=logging.INFO, **event): + """ + Append an event into the `lifecycle_events` attribute of this object. + Events are important moments during the object's life, such as "model created", + "model saved", "model loaded", etc. + + The `lifecycle_events` attribute is persisted across object's save() / load() operations. + It has no impact on the use of the model, but is useful during debugging and support. + + Set `self.lifecycle_events = None` to disable this behaviour. Calls to `add_lifecycle_event` will + do nothing then. + + Parameters + ---------- + event_name : str + Name of the event. Can be any label, e.g. "created", "stored" etc. + event : dict + Key-value mapping to append to `self.event_log`. Should be JSON-serializable, so keep it simple. + Can be empty. + + This method will automatically add the following key-values to `event`, so you don't have to specify them: + + - `datetime`: the current time + - `gensim`: the current Gensim version + - `python`: the current Python version + - `platform`: the current platform + - `event`: the name of this event + log_level : int + Also log the complete event dict, at the specified log level. Set to False to not log at all. + + """ + # See also https://github.com/RaRe-Technologies/gensim/issues/2863 + if not hasattr(self, 'lifecycle_events'): + logger.info("starting a new internal lifecycle event log for %s", self.__class__.__name__) + self.lifecycle_events = [] + elif self.lifecycle_events is None: # None = keep no lifecycle log + return + + event_dict = deepcopy(event) + event_dict['datetime'] = datetime.now().isoformat() + event_dict['gensim'] = gensim_version + event_dict['python'] = sys.version + event_dict['platform'] = platform.platform() + event_dict['event'] = event_name + if log_level: + logger.log(log_level, "recording lifecycle event %s", event_dict) + self.lifecycle_events.append(event_dict) + @classmethod def load(cls, fname, mmap=None): """Load an object previously saved using :meth:`~gensim.utils.SaveLoad.save` from a file. @@ -422,6 +474,7 @@ def load(cls, fname, mmap=None): obj = unpickle(fname) obj._load_specials(fname, mmap, compress, subname) logger.info("loaded %s", fname) + obj.add_lifecycle_event("loaded") return obj def _load_specials(self, fname, mmap, compress, subname): @@ -684,6 +737,7 @@ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore= Load object from file. """ + self.add_lifecycle_event("saved") try: _pickle.dump(self, fname_or_handle, protocol=pickle_protocol) logger.info("saved %s object", self.__class__.__name__) From 71bf6948f30614e08e2a97006b0100bc96f2f7ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= <radimrehurek@seznam.cz> Date: Thu, 4 Mar 2021 01:26:43 +0100 Subject: [PATCH 03/12] log events even if lifecycle attribute is turned off --- gensim/utils.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/gensim/utils.py b/gensim/utils.py index 2bce2a0c3a..fe80f10240 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -422,21 +422,22 @@ def add_lifecycle_event(self, event_name, log_level=logging.INFO, **event): """ # See also https://github.com/RaRe-Technologies/gensim/issues/2863 - if not hasattr(self, 'lifecycle_events'): - logger.info("starting a new internal lifecycle event log for %s", self.__class__.__name__) - self.lifecycle_events = [] - elif self.lifecycle_events is None: # None = keep no lifecycle log - return - event_dict = deepcopy(event) event_dict['datetime'] = datetime.now().isoformat() event_dict['gensim'] = gensim_version event_dict['python'] = sys.version event_dict['platform'] = platform.platform() event_dict['event'] = event_name + + if not hasattr(self, 'lifecycle_events'): + logger.info("starting a new internal lifecycle event log for %s", self.__class__.__name__) + self.lifecycle_events = [] + if log_level: logger.log(log_level, "recording lifecycle event %s", event_dict) - self.lifecycle_events.append(event_dict) + + if self.lifecycle_events is not None: + self.lifecycle_events.append(event_dict) @classmethod def load(cls, fname, mmap=None): From f59044f67fd6e5065b280248eccc5d5a1ece2de6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= <radimrehurek@seznam.cz> Date: Thu, 4 Mar 2021 01:40:13 +0100 Subject: [PATCH 04/12] fix overlong lines + use f-strings --- gensim/models/word2vec.py | 53 +++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index f888d36592..484828cb05 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -617,8 +617,8 @@ def prepare_vocab( self.add_lifecycle_event( "prepare_vocab", msg=( - "max_final_vocab=%d and min_count=%d resulted in calc_min_count=%d, effective_min_count=%d" % - (self.max_final_vocab, min_count, calc_min_count, self.effective_min_count, ) + f"max_final_vocab={self.max_final_vocab} and min_count={min_count} resulted " + f"in calc_min_count={calc_min_count}, effective_min_count={self.effective_min_count}" ) ) @@ -652,8 +652,8 @@ def prepare_vocab( self.add_lifecycle_event( "prepare_vocab", msg=( - "effective_min_count=%d retains %i unique words (%i%% of original %i, drops %i)" % - (self.effective_min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique,) + f"effective_min_count={self.effective_min_count} retains {len(retain_words)} unique " + f"words ({retain_unique_pct}%% of original {original_unique_total}, drops {drop_unique})" ), ) @@ -662,9 +662,9 @@ def prepare_vocab( self.add_lifecycle_event( "prepare_vocab", msg=( - "effective_min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)" % - (self.effective_min_count, retain_total, retain_pct, original_total, drop_total, ) - ) + f"effective_min_count={self.effective_min_count} leaves {retain_total} word corpus " + f"({retain_pct}%% of original {original_total}, drops {drop_total})" + ), ) else: logger.info("Updating model with new vocabulary") @@ -698,12 +698,9 @@ def prepare_vocab( self.add_lifecycle_event( "prepare_vocab", msg=( - "New added %i unique words (%i%% of original %i) " - "and increased the count of %i pre-existing words (%i%% of original %i)" % - ( - len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words), - pre_exist_unique_pct, original_unique_total, - ) + f"New added {len(new_words)} unique words ({new_unique_pct}%% of original " + f"{original_unique_total}) and increased the count of {len(pre_exist_words)} " + f"pre-existing words ({pre_exist_unique_pct}%% of original {original_unique_total})" ), ) retain_words = new_words + pre_exist_words @@ -741,8 +738,8 @@ def prepare_vocab( self.add_lifecycle_event( "prepare_vocab", msg=( - "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)" % - (downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total, ) + f"downsampling leaves estimated {downsample_total} word corpus " + f"({downsample_total * 100.0 / max(retain_total, 1):.1f}%% of prior {retain_total})" ), ) @@ -911,11 +908,15 @@ def _do_train_epoch( work, neu1 = thread_private_mem if self.sg: - examples, tally, raw_tally = train_epoch_sg(self, corpus_file, offset, cython_vocab, cur_epoch, - total_examples, total_words, work, neu1, self.compute_loss) + examples, tally, raw_tally = train_epoch_sg( + self, corpus_file, offset, cython_vocab, cur_epoch, + total_examples, total_words, work, neu1, self.compute_loss, + ) else: - examples, tally, raw_tally = train_epoch_cbow(self, corpus_file, offset, cython_vocab, cur_epoch, - total_examples, total_words, work, neu1, self.compute_loss) + examples, tally, raw_tally = train_epoch_cbow( + self, corpus_file, offset, cython_vocab, cur_epoch, + total_examples, total_words, work, neu1, self.compute_loss, + ) return examples, tally, raw_tally @@ -1036,12 +1037,9 @@ def train( self.add_lifecycle_event( "train", msg=( - "training model with %i workers on %i vocabulary and %i features, " - "using sg=%s hs=%s sample=%s negative=%s window=%s" % - ( - self.workers, len(self.wv), self.layer1_size, self.sg, self.hs, - self.sample, self.negative, self.window, - ) + f"training model with {self.workers} workers on {len(self.wv)} vocabulary and " + f"{self.layer1_size} features, using sg={self.sg} hs={self.hs} sample={self.sample} " + f"negative={self.negative} window={self.window}" ), ) @@ -1430,8 +1428,9 @@ def _train_epoch( thread.start() trained_word_count, raw_word_count, job_tally = self._log_epoch_progress( - progress_queue, job_queue, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words, - report_delay=report_delay, is_corpus_file_mode=False) + progress_queue, job_queue, cur_epoch=cur_epoch, total_examples=total_examples, + total_words=total_words, report_delay=report_delay, is_corpus_file_mode=False, + ) return trained_word_count, raw_word_count, job_tally From b3a25fb8d25c1c19f05aad5f9fcd2360ac92258b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= <radimrehurek@seznam.cz> Date: Thu, 4 Mar 2021 11:04:53 +0100 Subject: [PATCH 05/12] bump up internal version + improve docs --- docs/src/conf.py | 2 +- gensim/__init__.py | 2 +- gensim/models/word2vec.py | 4 ++-- gensim/utils.py | 21 +++++++++++++-------- release/bump_version.py | 11 ++++++++++- setup.py | 2 +- 6 files changed, 28 insertions(+), 14 deletions(-) diff --git a/docs/src/conf.py b/docs/src/conf.py index ba4973d061..036d84ac0d 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -63,7 +63,7 @@ # The short X.Y version. version = '4.0.0beta' # The full version, including alpha/beta/rc tags. -release = '4.0.0beta' +release = '4.0.0rc1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/gensim/__init__.py b/gensim/__init__.py index f61aec9dcc..db325ebfbd 100644 --- a/gensim/__init__.py +++ b/gensim/__init__.py @@ -4,7 +4,7 @@ """ -__version__ = '4.0.0beta' +__version__ = '4.0.0rc1' import logging diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 484828cb05..6cb01c10ec 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -473,7 +473,7 @@ def build_vocab( * `min_count` (int) - the minimum count threshold. **kwargs : object - Keyword arguments propagated to `self.prepare_vocab` + Keyword arguments propagated to `self.prepare_vocab`. """ self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=1) @@ -698,7 +698,7 @@ def prepare_vocab( self.add_lifecycle_event( "prepare_vocab", msg=( - f"New added {len(new_words)} unique words ({new_unique_pct}%% of original " + f"added {len(new_words)} new unique words ({new_unique_pct}%% of original " f"{original_unique_total}) and increased the count of {len(pre_exist_words)} " f"pre-existing words ({pre_exist_unique_pct}%% of original {original_unique_total})" ), diff --git a/gensim/utils.py b/gensim/utils.py index fe80f10240..93d0bf462a 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -392,27 +392,30 @@ class SaveLoad: """ def add_lifecycle_event(self, event_name, log_level=logging.INFO, **event): """ - Append an event into the `lifecycle_events` attribute of this object. + Append an event into the `lifecycle_events` attribute of this object, and also + optionally log the event at `log_level`. + Events are important moments during the object's life, such as "model created", "model saved", "model loaded", etc. - The `lifecycle_events` attribute is persisted across object's save() / load() operations. - It has no impact on the use of the model, but is useful during debugging and support. + The `lifecycle_events` attribute is persisted across object's :meth:`~gensim.utils.SaveLoad.save` + and :meth:`~gensim.utils.SaveLoad.load` operations. It has no impact on the use of the model, + but is useful during debugging and support. - Set `self.lifecycle_events = None` to disable this behaviour. Calls to `add_lifecycle_event` will - do nothing then. + Set `self.lifecycle_events = None` to disable this behaviour. Calls to `add_lifecycle_event()` + will not record events into `self.lifecycle_events` then. Parameters ---------- event_name : str Name of the event. Can be any label, e.g. "created", "stored" etc. event : dict - Key-value mapping to append to `self.event_log`. Should be JSON-serializable, so keep it simple. + Key-value mapping to append to `self.lifecycle_events`. Should be JSON-serializable, so keep it simple. Can be empty. This method will automatically add the following key-values to `event`, so you don't have to specify them: - - `datetime`: the current time + - `datetime`: the current date & time - `gensim`: the current Gensim version - `python`: the current Python version - `platform`: the current platform @@ -422,6 +425,7 @@ def add_lifecycle_event(self, event_name, log_level=logging.INFO, **event): """ # See also https://github.com/RaRe-Technologies/gensim/issues/2863 + event_dict = deepcopy(event) event_dict['datetime'] = datetime.now().isoformat() event_dict['gensim'] = gensim_version @@ -430,7 +434,8 @@ def add_lifecycle_event(self, event_name, log_level=logging.INFO, **event): event_dict['event'] = event_name if not hasattr(self, 'lifecycle_events'): - logger.info("starting a new internal lifecycle event log for %s", self.__class__.__name__) + # Avoid calling str(self), the object may not be fully initialized yet at this point. + logger.debug("starting a new internal lifecycle event log for %s", self.__class__.__name__) self.lifecycle_events = [] if log_level: diff --git a/release/bump_version.py b/release/bump_version.py index 083463acaf..12d09a2368 100644 --- a/release/bump_version.py +++ b/release/bump_version.py @@ -1,4 +1,13 @@ -"""Bumps the version of gensim in all the required places.""" +""" +Bump the version of Gensim in all the required places. + +Usage: python3 bump_version.py <OLD_VERSION> <NEW_VERSION> + +Example: + python3 bump_version.py "4.0.0beta" "4.0.0rc1" + +""" + import os.path import re import sys diff --git a/setup.py b/setup.py index 97e8a5523b..07fd74d2ae 100644 --- a/setup.py +++ b/setup.py @@ -335,7 +335,7 @@ def run(self): setup( name='gensim', - version='4.0.0beta', + version='4.0.0rc1', description='Python framework for fast Vector Space Modelling', long_description=LONG_DESCRIPTION, From 8d7b7d1f5e16b6cfa51e85d3500b241b92ff7c69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= <radimrehurek@seznam.cz> Date: Thu, 4 Mar 2021 11:15:56 +0100 Subject: [PATCH 06/12] record more info for load + save --- gensim/utils.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/gensim/utils.py b/gensim/utils.py index 93d0bf462a..c32c65f75d 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -479,8 +479,7 @@ def load(cls, fname, mmap=None): obj = unpickle(fname) obj._load_specials(fname, mmap, compress, subname) - logger.info("loaded %s", fname) - obj.add_lifecycle_event("loaded") + obj.add_lifecycle_event("loaded", fname=fname) return obj def _load_specials(self, fname, mmap, compress, subname): @@ -594,8 +593,6 @@ def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=fro in separate files. The automatic check is not performed in this case. """ - logger.info("saving %s under %s, separately %s", self, fname, separately) - compress, subname = SaveLoad._adapt_by_suffix(fname) restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol, @@ -743,7 +740,13 @@ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore= Load object from file. """ - self.add_lifecycle_event("saved") + self.add_lifecycle_event( + "saving", + fname_or_handle=str(fname_or_handle), + separately=str(separately), + sep_limit=sep_limit, + ignore=ignore, + ) try: _pickle.dump(self, fname_or_handle, protocol=pickle_protocol) logger.info("saved %s object", self.__class__.__name__) From 4adac6c5134c790f89f04facec721d196f97f4a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= <radimrehurek@seznam.cz> Date: Thu, 4 Mar 2021 19:22:19 +0100 Subject: [PATCH 07/12] lifecycle events for KeyedVectors --- gensim/models/keyedvectors.py | 58 +++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 193c5f8f0f..4fe31bcd13 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -191,8 +191,9 @@ class KeyedVectors(utils.SaveLoad): + def __init__(self, vector_size, count=0, dtype=np.float32, mapfile_path=None): - """Mapping between keys (such as words) and vectors for :class:`~gensim.models.Word2Vec` + """Mapping between keys (such as words) and vectors for :class:`~gensim.models.Word2Vec` and related models. Used to perform operations on the vectors such as vector lookup, distance, similarity etc. @@ -215,7 +216,7 @@ def __init__(self, vector_size, count=0, dtype=np.float32, mapfile_path=None): Vector dimensions will default to `np.float32` (AKA `REAL` in some Gensim code) unless another type is provided here. mapfile_path : string, optional - FIXME: UNDER CONSTRUCTION / WILL CHANGE PRE-4.0.0 PER #2955 / #2975. + Currently unused. """ self.vector_size = vector_size # pre-allocating `index_to_key` to full size helps avoid redundant re-allocations, esp for `expandos` @@ -259,7 +260,7 @@ def _load_specials(self, *args, **kwargs): self._upconvert_old_vocab() def _upconvert_old_vocab(self): - """Convert a loaded, pre-gensim-4.0.0 version instance that had a 'vocab' dict of data objects""" + """Convert a loaded, pre-gensim-4.0.0 version instance that had a 'vocab' dict of data objects.""" old_vocab = self.__dict__.pop('vocab', None) self.key_to_index = {} for k in old_vocab.keys(): @@ -277,6 +278,7 @@ def allocate_vecattrs(self, attrs=None, types=None): The length of the index_to_key list is canonical 'intended size' of KeyedVectors, even if other properties (vectors array) hasn't yet been allocated or expanded. So this allocation targets that size. + """ # with no arguments, adjust lengths of existing vecattr arrays to match length of index_to_key if attrs is None: @@ -351,13 +353,8 @@ def get_vecattr(self, key, attr): def resize_vectors(self, seed=0): """Make underlying vectors match index_to_key size; random-initialize any new rows.""" - target_shape = (len(self.index_to_key), self.vector_size) self.vectors = prep_vectors(target_shape, prior_vectors=self.vectors, seed=seed) - # FIXME BEFORE 4.0.0 PER #2955 / #2975 : support memmap & cleanup -# if hasattr(self, 'mapfile_path') and self.mapfile_path: -# self.vectors = np.memmap(self.mapfile_path, shape=(target_count, self.vector_size), mode='w+', dtype=REAL) - self.allocate_vecattrs() self.norms = None @@ -370,7 +367,7 @@ def __getitem__(self, key_or_keys): Parameters ---------- key_or_keys : {str, list of str, int, list of int} - Requested key or list-of-keys + Requested key or list-of-keys. Returns ------- @@ -784,7 +781,7 @@ def most_similar( return result[:topn] def similar_by_word(self, word, topn=10, restrict_vocab=None): - """Compatibility alias for similar_by_key()""" + """Compatibility alias for similar_by_key().""" return self.similar_by_key(word, topn, restrict_vocab) def similar_by_key(self, key, topn=10, restrict_vocab=None): @@ -1193,14 +1190,20 @@ def _log_evaluate_word_analogies(section): Returns ------- float - Accuracy score. + Accuracy score if at least one prediction was made (correct or incorrect). + + Or return 0.0 if there were no predictions at all in this section. """ correct, incorrect = len(section['correct']), len(section['incorrect']) - if correct + incorrect > 0: - score = correct / (correct + incorrect) - logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect) - return score + + if correct + incorrect == 0: + return 0.0 + + score = correct / (correct + incorrect) + logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect) + return score + def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): """Compute performance of the model on an analogy test set. @@ -1324,7 +1327,7 @@ def log_accuracy(section): if correct + incorrect > 0: logger.info( "%s: %.1f%% (%i/%i)", - section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect + section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect, ) @staticmethod @@ -1463,7 +1466,7 @@ def init_sims(self, replace=False): def unit_normalize_all(self): """Destructively scale all vectors to unit-length. - (You cannot sensibly continue training after such a step.) + You cannot sensibly continue training after such a step. """ self.fill_norms() @@ -1495,7 +1498,8 @@ def relative_cosine_similarity(self, wa, wb, topn=10): """ sims = self.similar_by_word(wa, topn) - assert sims, "Failed code invariant: list of similar words must never be empty." + if not sims: + raise ValueError("Cannot calculate relative cosine similarity without any similar words.") rcs = float(self.similarity(wa, wb)) / (sum(sim for _, sim in sims)) return rcs @@ -1583,7 +1587,7 @@ def load_word2vec_format( cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', limit=None, datatype=REAL, no_header=False, ): - """Load the input-hidden weight matrix from the original C word2vec-tool format. + """Load KeyedVectors from a file produced by the original C word2vec-tool format. Warnings -------- @@ -1660,7 +1664,7 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format if not vector_size == self.vector_size: raise ValueError("incompatible vector size %d in file %s" % (vector_size, fname)) - # TOCONSIDER: maybe mismatched vectors still useful enough to merge (truncating/padding)? + # TODO: maybe mismatched vectors still useful enough to merge (truncating/padding)? if binary: binary_len = dtype(REAL).itemsize * vector_size for _ in range(vocab_size): @@ -1688,7 +1692,10 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut overlap_count += 1 self.vectors[self.get_index(word)] = weights self.vectors_lockf[self.get_index(word)] = lockf # lock-factor: 0.0=no changes - logger.info("merged %d vectors into %s matrix from %s", overlap_count, self.wv.vectors.shape, fname) + self.add_lifecycle_event( + "intersect_word2vec_format", + msg=f"merged {overlap_count} vectors into {self.vectors.shape} matrix from {fname}", + ) def _upconvert_old_d2vkv(self): """Convert a deserialized older Doc2VecKeyedVectors instance to latest generic KeyedVectors""" @@ -1721,6 +1728,7 @@ def similarity_unseen_docs(self, *args, **kwargs): class CompatVocab: + def __init__(self, **kwargs): """A single vocabulary item, used internally for collecting per-word frequency/sampling info, and for constructing binary trees (incl. both word leaves and inner nodes). @@ -1847,7 +1855,7 @@ def _load_word2vec_format( fname : str The file path to the saved word2vec-format file. fvocab : str, optional - File path to the vocabulary.Word counts are read from `fvocab` filename, if set + File path to the vocabulary. Word counts are read from `fvocab` filename, if set (this is the file generated by `-save-vocab` flag of the original C tool). binary : bool, optional If True, indicates whether the data is in binary word2vec format. @@ -1913,7 +1921,11 @@ def _load_word2vec_format( kv.vectors = ascontiguousarray(kv.vectors[: len(kv)]) assert (len(kv), vector_size) == kv.vectors.shape - logger.info("loaded %s matrix from %s", kv.vectors.shape, fname) + kv.add_lifecycle_event( + "load_word2vec_format", + msg=f"loaded {kv.vectors.shape} matrix of type {kv.dtype} from {fname}", + binary=binary, encoding=encoding, + ) return kv From 5e0e1271033aebdb6d3394ce67a1b5076f3a4c85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= <radimrehurek@seznam.cz> Date: Thu, 4 Mar 2021 19:51:47 +0100 Subject: [PATCH 08/12] add lifecycle events to remaining models --- gensim/corpora/dictionary.py | 4 ++++ gensim/models/fasttext.py | 7 +++++-- gensim/models/keyedvectors.py | 4 +--- gensim/models/ldamodel.py | 10 ++++++++-- gensim/models/ldamulticore.py | 2 +- gensim/models/lsimodel.py | 20 +++++++++++++------- gensim/models/phrases.py | 6 +++++- gensim/models/rpmodel.py | 1 + gensim/models/tfidfmodel.py | 11 +++++++---- gensim/models/word2vec.py | 11 +++++++---- 10 files changed, 52 insertions(+), 24 deletions(-) diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index e046134250..6f40f252aa 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -77,6 +77,10 @@ def __init__(self, documents=None, prune_at=2000000): if documents is not None: self.add_documents(documents, prune_at=prune_at) + self.add_lifecycle_event( + "created", + msg=f"built {self} from {self.num_docs} documents (total {self.num_pos} corpus positions)", + ) def __getitem__(self, tokenid): """Get the string token that corresponds to `tokenid`. diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 38dd4172f6..b52a77bcc4 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -653,7 +653,7 @@ def _pad_ones(m, new_len): def load_facebook_model(path, encoding='utf-8'): - """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` output file. + """Load the model from Facebook's native fasttext `.bin` output file. Notes ------ @@ -835,7 +835,10 @@ def _load_fasttext_format(model_file, encoding='utf-8', full_model=True): _check_model(model) - logger.info("loaded %s weight matrix for fastText model from %s", m.vectors_ngrams.shape, fin.name) + model.add_lifecycle_event( + "load_fasttext_format", + msg=f"loaded {m.vectors_ngrams.shape} weight matrix for fastText model from {fin.name}", + ) return model diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 4fe31bcd13..2251a65efb 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -1204,7 +1204,6 @@ def _log_evaluate_word_analogies(section): logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect) return score - def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): """Compute performance of the model on an analogy test set. @@ -1923,7 +1922,7 @@ def _load_word2vec_format( kv.add_lifecycle_event( "load_word2vec_format", - msg=f"loaded {kv.vectors.shape} matrix of type {kv.dtype} from {fname}", + msg=f"loaded {kv.vectors.shape} matrix of type {kv.vectors.dtype} from {fname}", binary=binary, encoding=encoding, ) return kv @@ -1951,7 +1950,6 @@ def prep_vectors(target_shape, prior_vectors=None, seed=0, dtype=REAL): """Return a numpy array of the given shape. Reuse prior_vectors object or values to extent possible. Initialize new values randomly if requested. - FIXME: NAME/DOCS CHANGES PRE-4.0.0 FOR #2955/#2975 MMAP & OTHER INITIALIZATION CLEANUP WORK. """ if prior_vectors is None: prior_vectors = np.zeros((0, 0)) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 6ebc0e8b4f..439aeb91fb 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -88,16 +88,17 @@ import logging import numbers import os +import time +from collections import defaultdict import numpy as np from scipy.special import gammaln, psi # gamma function utils from scipy.special import polygamma -from collections import defaultdict from gensim import interfaces, utils, matutils from gensim.matutils import ( kullback_leibler, hellinger, jaccard_distance, jensen_shannon, - dirichlet_expectation, logsumexp, mean_absolute_difference + dirichlet_expectation, logsumexp, mean_absolute_difference, ) from gensim.models import basemodel, CoherenceModel from gensim.models.callbacks import Callback @@ -518,7 +519,12 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, # if a training corpus was provided, start estimating the model right away if corpus is not None: use_numpy = self.dispatcher is not None + start = time.time() self.update(corpus, chunks_as_numpy=use_numpy) + self.add_lifecycle_event( + "created", + msg=f"trained {self} in {time.time() - start:.2f}s", + ) def init_dir_prior(self, prior, name): """Initialize priors for the Dirichlet distribution. diff --git a/gensim/models/ldamulticore.py b/gensim/models/ldamulticore.py index a86daf1831..b65fcdd240 100644 --- a/gensim/models/ldamulticore.py +++ b/gensim/models/ldamulticore.py @@ -181,7 +181,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None, id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta, decay=decay, offset=offset, eval_every=eval_every, iterations=iterations, gamma_threshold=gamma_threshold, random_state=random_state, minimum_probability=minimum_probability, - minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics, dtype=dtype + minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics, dtype=dtype, ) def update(self, corpus, chunks_as_numpy=False): diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index d40690fd70..39a82bfc20 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -61,6 +61,7 @@ import logging import sys +import time import numpy as np import scipy.linalg @@ -351,17 +352,17 @@ class LsiModel(interfaces.TransformationABC, basemodel.BaseTopicModel): """ - def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000, - decay=1.0, distributed=False, onepass=True, - power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS, dtype=np.float64): - """Construct an `LsiModel` object. - - Either `corpus` or `id2word` must be supplied in order to train the model. + def __init__( + self, corpus=None, num_topics=200, id2word=None, chunksize=20000, + decay=1.0, distributed=False, onepass=True, + power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS, dtype=np.float64 + ): + """Build an LSI model. Parameters ---------- corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional - Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`). + Stream of document vectors or a sparse matrix of shape (`num_documents`, `num_terms`). num_topics : int, optional Number of requested factors (latent dimensions) id2word : dict of {int: str}, optional @@ -440,7 +441,12 @@ def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000, raise RuntimeError("failed to initialize distributed LSI (%s)" % err) if corpus is not None: + start = time.time() self.add_documents(corpus) + self.add_lifecycle_event( + "created", + msg=f"trained {self} in {time.time() - start:.2f}s", + ) def add_documents(self, corpus, chunksize=None, decay=None): """Update model with new `corpus`. diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 410fb9dced..8e20333b8f 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -66,6 +66,7 @@ from math import log import pickle from inspect import getfullargspec as getargspec +import time from gensim import utils, interfaces @@ -566,7 +567,9 @@ def __init__( raise pickle.PickleError(f'Custom scoring function in {self.__class__.__name__} must be pickle-able') if sentences is not None: + start = time.time() self.add_vocab(sentences) + self.add_lifecycle_event("created", msg=f"built {self} in {time.time() - start:.2f}s") def __str__(self): return "%s<%i vocab, min_count=%s, threshold=%s, max_vocab_size=%s>" % ( @@ -772,8 +775,9 @@ def __init__(self, phrases_model): self.scoring = phrases_model.scoring self.connector_words = phrases_model.connector_words logger.info('exporting phrases from %s', phrases_model) + start = time.time() self.phrasegrams = phrases_model.export_phrases() - logger.info('exported %s', self) + self.add_lifecycle_event("created", msg=f"exported {self} from {phrases_model} in {time.time() - start:.2f}s") def __str__(self): return "%s<%i phrases, min_count=%s, threshold=%s>" % ( diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index a8bf0044fa..1b2bf9fbb2 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -67,6 +67,7 @@ def __init__(self, corpus, id2word=None, num_topics=300): self.num_topics = num_topics if corpus is not None: self.initialize(corpus) + self.add_lifecycle_event("created", msg=f"created {self}") def __str__(self): return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 06ba520e90..4152f3eb3d 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -463,11 +463,14 @@ def initialize(self, corpus): self.dfs = dfs self.term_lengths = None # and finally compute the idf weights - logger.info( - "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)", - self.num_docs, max(dfs.keys()) + 1 if dfs else 0, self.num_nnz - ) self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) + self.add_lifecycle_event( + "initialize", + msg=( + f"calculated IDF weights for {self.num_docs} documents and {max(dfs.keys()) + 1 if dfs else 0}" + f" features ({self.num_nnz} matrix non-zeros)" + ), + ) def __getitem__(self, bow, eps=1e-12): """Get the tf-idf representation of an input vector and/or corpus. diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 6cb01c10ec..9c35300554 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -192,6 +192,7 @@ import itertools import copy from queue import Queue, Empty +import time from numpy import float32 as REAL import numpy as np @@ -1053,6 +1054,7 @@ def train( raw_word_count = 0 start = default_timer() - 0.00001 job_tally = 0 + start = time.time() for cur_epoch in range(self.epochs): for callback in callbacks: @@ -1084,6 +1086,7 @@ def train( for callback in callbacks: callback.on_train_end(self) + return trained_word_count, raw_word_count def _worker_loop_corpusfile( @@ -1662,10 +1665,10 @@ def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_ Total number of jobs processed during training. """ - logger.info( - "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - raw_word_count, trained_word_count, total_elapsed, trained_word_count / total_elapsed - ) + self.add_lifecycle_event("train", msg=( + f"training on {raw_word_count} raw words ({trained_word_count} effective words) " + f"took {total_elapsed:.1f}s, {trained_word_count / total_elapsed:.0f} effective words/s" + )) def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor=2, report_delay=1): """Score the log probability for a sequence of sentences. From 76310da9ae50807451172e3e8eb6bbb931aa2682 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= <radimrehurek@seznam.cz> Date: Sun, 7 Mar 2021 13:29:26 +0100 Subject: [PATCH 09/12] ask for lifecycle log in ISSUE_TEMPLATE --- ISSUE_TEMPLATE.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md index ec31c5964f..95121b30ae 100644 --- a/ISSUE_TEMPLATE.md +++ b/ISSUE_TEMPLATE.md @@ -15,6 +15,12 @@ What are you trying to achieve? What is the expected result? What are you seeing Include full tracebacks, logs and datasets if necessary. Please keep the examples minimal ("minimal reproducible example"). +If your problem is with a specific Gensim model (word2vec, lsimodel, doc2vec, fasttext, ldamodel etc), include the following: + +```python +print(my_model.lifecycle_events) +``` + #### Versions Please provide the output of: From 0526d2a2d4c8a72c4fe8f83bb521feeec88d5cd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= <radimrehurek@seznam.cz> Date: Sun, 7 Mar 2021 13:30:33 +0100 Subject: [PATCH 10/12] improve logging --- gensim/models/word2vec.py | 3 +-- gensim/utils.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 9c35300554..5dbd405c07 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1054,7 +1054,6 @@ def train( raw_word_count = 0 start = default_timer() - 0.00001 job_tally = 0 - start = time.time() for cur_epoch in range(self.epochs): for callback in callbacks: @@ -1631,7 +1630,7 @@ def _log_epoch_end( """ logger.info( "EPOCH - %i : training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - cur_epoch + 1, raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed + cur_epoch + 1, raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed, ) # don't warn if training in file-based mode, because it's expected behavior diff --git a/gensim/utils.py b/gensim/utils.py index c32c65f75d..cf5d6b6499 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -439,7 +439,7 @@ def add_lifecycle_event(self, event_name, log_level=logging.INFO, **event): self.lifecycle_events = [] if log_level: - logger.log(log_level, "recording lifecycle event %s", event_dict) + logger.log(log_level, "%s lifecycle event %s", self.__class__.__name__, event_dict) if self.lifecycle_events is not None: self.lifecycle_events.append(event_dict) From 73540881cf70b014cdaf9aa25194c0f3de8a1a5e Mon Sep 17 00:00:00 2001 From: Michael Penkov <m@penkov.dev> Date: Sun, 7 Mar 2021 22:20:31 +0900 Subject: [PATCH 11/12] Update word2vec.py remove unused import --- gensim/models/word2vec.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 5dbd405c07..77be91e5c6 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -192,7 +192,6 @@ import itertools import copy from queue import Queue, Empty -import time from numpy import float32 as REAL import numpy as np From 7bc92ea26afad113928dd942e16fbfd93e57c119 Mon Sep 17 00:00:00 2001 From: Michael Penkov <m@penkov.dev> Date: Sun, 7 Mar 2021 22:45:31 +0900 Subject: [PATCH 12/12] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6996ed2477..25737c095f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Changes ## Unreleased - fix RuntimeError in export_phrases (change defaultdict to dict) (PR [#3041](https://github.com/RaRe-Technologies/gensim/pull/3041), [@thalishsajeed](https://github.com/thalishsajeed)) +- Record lifecycle events in Gensim models (PR [#3060](https://github.com/RaRe-Technologies/gensim/pull/3060), [@piskvorky](https://github.com/piskvorky)) ## 4.0.0beta, 2020-10-31