add lifecycle events to remaining models
piskvorky committed Mar 4, 2021
1 parent 4adac6c commit 5e0e127
Showing 10 changed files with 52 additions and 24 deletions.
4 changes: 4 additions & 0 deletions gensim/corpora/dictionary.py
@@ -77,6 +77,10 @@ def __init__(self, documents=None, prune_at=2000000):
 
         if documents is not None:
             self.add_documents(documents, prune_at=prune_at)
+            self.add_lifecycle_event(
+                "created",
+                msg=f"built {self} from {self.num_docs} documents (total {self.num_pos} corpus positions)",
+            )
 
     def __getitem__(self, tokenid):
         """Get the string token that corresponds to `tokenid`.
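For context, a minimal sketch of how the new event surfaces to callers, assuming `add_lifecycle_event` appends a plain dict (the `msg` plus metadata such as timestamp and gensim version) to a `lifecycle_events` list on the object:

from gensim.corpora import Dictionary

docs = [["human", "computer", "interaction"], ["graph", "trees"]]  # toy corpus
dct = Dictionary(docs)

# the "created" event recorded in __init__ above is now attached to the object
for event in dct.lifecycle_events:
    print(event)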
7 changes: 5 additions & 2 deletions gensim/models/fasttext.py
@@ -653,7 +653,7 @@ def _pad_ones(m, new_len):
 
 
 def load_facebook_model(path, encoding='utf-8'):
-    """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` output file.
+    """Load the model from Facebook's native fasttext `.bin` output file.
     Notes
     ------
@@ -835,7 +835,10 @@ def _load_fasttext_format(model_file, encoding='utf-8', full_model=True):
 
     _check_model(model)
 
-    logger.info("loaded %s weight matrix for fastText model from %s", m.vectors_ngrams.shape, fin.name)
+    model.add_lifecycle_event(
+        "load_fasttext_format",
+        msg=f"loaded {m.vectors_ngrams.shape} weight matrix for fastText model from {fin.name}",
+    )
     return model
 
 
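A usage sketch for the new event; the `.bin` path below is a placeholder for a real native fastText binary:

from gensim.models.fasttext import load_facebook_model

model = load_facebook_model("cc.en.300.bin")  # placeholder path
# the "load_fasttext_format" record, whose msg includes the ngram-matrix
# shape, is now part of the model's recorded history:
for event in model.lifecycle_events:
    print(event)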
4 changes: 1 addition & 3 deletions gensim/models/keyedvectors.py
@@ -1204,7 +1204,6 @@ def _log_evaluate_word_analogies(section):
         logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect)
         return score
 
-
     def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
         """Compute performance of the model on an analogy test set.
@@ -1923,7 +1922,7 @@ def _load_word2vec_format(
 
     kv.add_lifecycle_event(
         "load_word2vec_format",
-        msg=f"loaded {kv.vectors.shape} matrix of type {kv.dtype} from {fname}",
+        msg=f"loaded {kv.vectors.shape} matrix of type {kv.vectors.dtype} from {fname}",
         binary=binary, encoding=encoding,
     )
     return kv
@@ -1951,7 +1950,6 @@ def prep_vectors(target_shape, prior_vectors=None, seed=0, dtype=REAL):
     """Return a numpy array of the given shape. Reuse prior_vectors object or values
     to extent possible. Initialize new values randomly if requested.
-    FIXME: NAME/DOCS CHANGES PRE-4.0.0 FOR #2955/#2975 MMAP & OTHER INITIALIZATION CLEANUP WORK.
     """
     if prior_vectors is None:
         prior_vectors = np.zeros((0, 0))
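The `msg` fix above matters because the dtype lives on the underlying numpy array, not on the `KeyedVectors` object itself; `kv.vectors.dtype` reads it from the actual data rather than a possibly stale attribute. A sketch with a placeholder path:

from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format("vectors.bin", binary=True)  # placeholder path
print(kv.vectors.dtype)         # dtype of the stored vectors, e.g. float32
print(kv.lifecycle_events[-1])  # the "load_word2vec_format" record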
10 changes: 8 additions & 2 deletions gensim/models/ldamodel.py
@@ -88,16 +88,17 @@
 import logging
 import numbers
 import os
+import time
+from collections import defaultdict
 
 import numpy as np
 from scipy.special import gammaln, psi  # gamma function utils
 from scipy.special import polygamma
-from collections import defaultdict
 
 from gensim import interfaces, utils, matutils
 from gensim.matutils import (
     kullback_leibler, hellinger, jaccard_distance, jensen_shannon,
-    dirichlet_expectation, logsumexp, mean_absolute_difference
+    dirichlet_expectation, logsumexp, mean_absolute_difference,
 )
 from gensim.models import basemodel, CoherenceModel
 from gensim.models.callbacks import Callback
@@ -518,7 +519,12 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         # if a training corpus was provided, start estimating the model right away
         if corpus is not None:
             use_numpy = self.dispatcher is not None
+            start = time.time()
             self.update(corpus, chunks_as_numpy=use_numpy)
+            self.add_lifecycle_event(
+                "created",
+                msg=f"trained {self} in {time.time() - start:.2f}s",
+            )
 
     def init_dir_prior(self, prior, name):
         """Initialize priors for the Dirichlet distribution.
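A quick sketch of the resulting "created" event on a toy corpus (the timing in `msg` will of course vary), assuming the record keeps the `msg` kwarg under a "msg" key:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

docs = [["human", "computer", "interaction"], ["graph", "trees", "graph"]]
dct = Dictionary(docs)
corpus = [dct.doc2bow(doc) for doc in docs]

lda = LdaModel(corpus, num_topics=2, id2word=dct)
print(lda.lifecycle_events[-1]["msg"])  # e.g. "trained LdaModel(...) in 0.02s"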
2 changes: 1 addition & 1 deletion gensim/models/ldamulticore.py
@@ -181,7 +181,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
             id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta,
             decay=decay, offset=offset, eval_every=eval_every, iterations=iterations,
             gamma_threshold=gamma_threshold, random_state=random_state, minimum_probability=minimum_probability,
-            minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics, dtype=dtype
+            minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics, dtype=dtype,
         )
 
     def update(self, corpus, chunks_as_numpy=False):
20 changes: 13 additions & 7 deletions gensim/models/lsimodel.py
@@ -61,6 +61,7 @@
 
 import logging
 import sys
+import time
 
 import numpy as np
 import scipy.linalg
@@ -351,17 +352,17 @@ class LsiModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
     """
 
-    def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
-                 decay=1.0, distributed=False, onepass=True,
-                 power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS, dtype=np.float64):
-        """Construct an `LsiModel` object.
-        Either `corpus` or `id2word` must be supplied in order to train the model.
+    def __init__(
+            self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
+            decay=1.0, distributed=False, onepass=True,
+            power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS, dtype=np.float64
+    ):
+        """Build an LSI model.
         Parameters
         ----------
         corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional
-            Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`).
+            Stream of document vectors or a sparse matrix of shape (`num_documents`, `num_terms`).
         num_topics : int, optional
             Number of requested factors (latent dimensions)
         id2word : dict of {int: str}, optional
@@ -440,7 +441,12 @@ def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
             raise RuntimeError("failed to initialize distributed LSI (%s)" % err)
 
         if corpus is not None:
+            start = time.time()
             self.add_documents(corpus)
+            self.add_lifecycle_event(
+                "created",
+                msg=f"trained {self} in {time.time() - start:.2f}s",
+            )
 
     def add_documents(self, corpus, chunksize=None, decay=None):
         """Update model with new `corpus`.
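One motivation for recording an event instead of only logging: the record travels with the model. A sketch, assuming `lifecycle_events` is pickled by `save()` like any other attribute ("lsi.model" is a placeholder path):

from gensim.corpora import Dictionary
from gensim.models import LsiModel

docs = [["human", "computer", "interaction"], ["graph", "trees"]]
dct = Dictionary(docs)
corpus = [dct.doc2bow(doc) for doc in docs]

lsi = LsiModel(corpus, num_topics=2, id2word=dct)
lsi.save("lsi.model")
loaded = LsiModel.load("lsi.model")
print(loaded.lifecycle_events)  # the "created" record survives the round trip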
6 changes: 5 additions & 1 deletion gensim/models/phrases.py
@@ -66,6 +66,7 @@
 from math import log
 import pickle
 from inspect import getfullargspec as getargspec
+import time
 
 from gensim import utils, interfaces
 
@@ -566,7 +567,9 @@ def __init__(
             raise pickle.PickleError(f'Custom scoring function in {self.__class__.__name__} must be pickle-able')
 
         if sentences is not None:
+            start = time.time()
             self.add_vocab(sentences)
+            self.add_lifecycle_event("created", msg=f"built {self} in {time.time() - start:.2f}s")
 
     def __str__(self):
         return "%s<%i vocab, min_count=%s, threshold=%s, max_vocab_size=%s>" % (
@@ -772,8 +775,9 @@ def __init__(self, phrases_model):
         self.scoring = phrases_model.scoring
         self.connector_words = phrases_model.connector_words
         logger.info('exporting phrases from %s', phrases_model)
+        start = time.time()
         self.phrasegrams = phrases_model.export_phrases()
-        logger.info('exported %s', self)
+        self.add_lifecycle_event("created", msg=f"exported {self} from {phrases_model} in {time.time() - start:.2f}s")
 
     def __str__(self):
         return "%s<%i phrases, min_count=%s, threshold=%s>" % (
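Both the trainable `Phrases` model and the exported `FrozenPhrases` now record "created" events. A sketch, assuming `Phrases.freeze()` as the export entry point (it wraps the `FrozenPhrases` constructor changed above):

from gensim.models.phrases import Phrases

sentences = [["new", "york", "city"], ["new", "york", "times"]] * 10
phrases = Phrases(sentences, min_count=1, threshold=1)
frozen = phrases.freeze()  # goes through FrozenPhrases.__init__

print(phrases.lifecycle_events[-1])  # "built Phrases<...> in ...s"
print(frozen.lifecycle_events[-1])   # "exported FrozenPhrases<...> from Phrases<...> in ...s"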
1 change: 1 addition & 0 deletions gensim/models/rpmodel.py
@@ -67,6 +67,7 @@ def __init__(self, corpus, id2word=None, num_topics=300):
         self.num_topics = num_topics
         if corpus is not None:
             self.initialize(corpus)
+            self.add_lifecycle_event("created", msg=f"created {self}")
 
     def __str__(self):
         return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics)
11 changes: 7 additions & 4 deletions gensim/models/tfidfmodel.py
@@ -463,11 +463,14 @@ def initialize(self, corpus):
         self.dfs = dfs
         self.term_lengths = None
         # and finally compute the idf weights
-        logger.info(
-            "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)",
-            self.num_docs, max(dfs.keys()) + 1 if dfs else 0, self.num_nnz
-        )
         self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
+        self.add_lifecycle_event(
+            "initialize",
+            msg=(
+                f"calculated IDF weights for {self.num_docs} documents and {max(dfs.keys()) + 1 if dfs else 0}"
+                f" features ({self.num_nnz} matrix non-zeros)"
+            ),
+        )
 
     def __getitem__(self, bow, eps=1e-12):
         """Get the tf-idf representation of an input vector and/or corpus.
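Note the event name here is "initialize" rather than "created", matching the method that computes the IDF table. A toy sketch:

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

docs = [["cat", "sat", "mat"], ["dog", "sat", "log"]]
dct = Dictionary(docs)
corpus = [dct.doc2bow(doc) for doc in docs]

tfidf = TfidfModel(corpus)  # __init__ calls initialize(), which records the event
print(tfidf.lifecycle_events[-1]["msg"])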
11 changes: 7 additions & 4 deletions gensim/models/word2vec.py
@@ -192,6 +192,7 @@
 import itertools
 import copy
 from queue import Queue, Empty
+import time
 
 from numpy import float32 as REAL
 import numpy as np
@@ -1053,6 +1054,7 @@ def train(
         raw_word_count = 0
         start = default_timer() - 0.00001
         job_tally = 0
+        start = time.time()
 
         for cur_epoch in range(self.epochs):
             for callback in callbacks:
@@ -1084,6 +1086,7 @@
 
         for callback in callbacks:
             callback.on_train_end(self)
+
         return trained_word_count, raw_word_count
 
     def _worker_loop_corpusfile(
@@ -1662,10 +1665,10 @@ def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_tally):
             Total number of jobs processed during training.
         """
-        logger.info(
-            "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s",
-            raw_word_count, trained_word_count, total_elapsed, trained_word_count / total_elapsed
-        )
+        self.add_lifecycle_event("train", msg=(
+            f"training on {raw_word_count} raw words ({trained_word_count} effective words) "
+            f"took {total_elapsed:.1f}s, {trained_word_count / total_elapsed:.0f} effective words/s"
+        ))
 
     def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor=2, report_delay=1):
         """Score the log probability for a sequence of sentences.
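With this change, `_log_train_end` records a "train" event (which the lifecycle API also logs), so training statistics persist with the model. A sketch on a toy corpus, assuming each record keeps its name under an "event" key:

from gensim.models import Word2Vec

sentences = [["the", "quick", "brown", "fox"], ["jumps", "over", "the", "dog"]] * 50
model = Word2Vec(sentences, vector_size=10, min_count=1, epochs=2)

train_events = [e for e in model.lifecycle_events if e.get("event") == "train"]
print(train_events[-1]["msg"])  # raw/effective word counts, wall time, words/s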
