Improve the way reduced_windows is passed around and used.
pandrey-fr committed Jun 10, 2021
1 parent 6afb3ff commit 6a93037
Showing 3 changed files with 34 additions and 43 deletions.
31 changes: 18 additions & 13 deletions gensim/models/word2vec.py
@@ -223,13 +223,13 @@

def train_epoch_sg(
model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words,
_work, _neu1, compute_loss, reduced_windows,
_work, _neu1, compute_loss,
):
raise RuntimeError("Training with corpus_file argument is not supported")

def train_epoch_cbow(
model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words,
_work, _neu1, compute_loss, reduced_windows,
_work, _neu1, compute_loss,
):
raise RuntimeError("Training with corpus_file argument is not supported")

@@ -240,7 +240,7 @@ def __init__(
max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0,
trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
comment=None, max_final_vocab=None, reduced_windows=True
comment=None, max_final_vocab=None, reduced_windows=True,
):
"""Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/.
@@ -346,8 +346,10 @@ def __init__(
callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
Sequence of callbacks to be executed at specific stages during training.
reduced_windows : bool, optional
If True, the window size is uniformly sampled from {1, `window`}
during training. Otherwise, it is fixed to `window`.
If True, the effective window size is uniformly sampled from [1, `window`]
for each target word during training, to match the original word2vec algorithm's
approximate weighting of context words by distance. Otherwise, the effective
window size is always fixed to `window` words to either side.
Examples
--------
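The example section of this docstring is collapsed in the diff view above; as a stand-in, the sketch below shows how the constructor-level flag described here might be used. It assumes the keyword spelling `reduced_windows` from this branch (released gensim builds may expose the behaviour under another name) and uses the toy `common_texts` corpus shipped with gensim.

# Hedged usage sketch for the constructor flag (keyword name taken from this branch;
# it may differ in released gensim versions).
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

# Sampled windows (the default): each target word trains on a context whose
# half-width is drawn uniformly from [1, window], reproducing the original
# word2vec weighting of nearby context words.
model_sampled = Word2Vec(
    common_texts, vector_size=32, window=5, min_count=1, seed=1,
    reduced_windows=True,
)

# Fixed windows: every target word always uses the full `window` words per side.
model_fixed = Word2Vec(
    common_texts, vector_size=32, window=5, min_count=1, seed=1,
    reduced_windows=False,
)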
@@ -916,13 +918,13 @@ def _do_train_epoch(
examples, tally, raw_tally = train_epoch_sg(
self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1,
self.compute_loss, self.reduced_windows,
self.compute_loss,
)
else:
examples, tally, raw_tally = train_epoch_cbow(
self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1,
self.compute_loss, self.reduced_windows,
self.compute_loss,
)

return examples, tally, raw_tally
@@ -950,12 +952,12 @@ def _do_train_job(self, sentences, alpha, inits):
if self.sg:
tally += train_batch_sg(
self, sentences, alpha, work,
self.compute_loss, self.reduced_windows,
self.compute_loss,
)
else:
tally += train_batch_cbow(
self, sentences, alpha, work, neu1,
self.compute_loss, self.reduced_windows,
self.compute_loss,
)
return tally, self._raw_word_count(sentences)

@@ -967,7 +969,7 @@ def train(
self, corpus_iterable=None, corpus_file=None, total_examples=None,
total_words=None, epochs=None, start_alpha=None, end_alpha=None,
word_count=0, queue_factor=2, report_delay=1.0, compute_loss=False,
reduced_windows=True, callbacks=(), **kwargs,
reduced_windows=None, callbacks=(), **kwargs,
):
"""Update the model's neural weights from a sequence of sentences.
@@ -1025,8 +1027,10 @@ def train(
If True, computes and stores loss value which can be retrieved using
:meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`.
reduced_windows : bool, optional
If True, the window size is uniformly sampled from {1, `window`}
during training. Otherwise, it is fixed to `window`.
If True, the effective window size is uniformly sampled from [1, `window`]
for each target word during training, to match the original word2vec algorithm's
approximate weighting of context words by distance. Otherwise, the effective
window size is always fixed to `window` words to either side.
callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
Sequence of callbacks to be executed at specific stages during training.
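A hedged sketch of the matching train()-time override (corpus and counts are illustrative; the keyword spelling again follows this branch):

# Passing reduced_windows to train() updates model.reduced_windows before the
# epochs run; omitting it (None) keeps whatever the constructor set.
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

model = Word2Vec(common_texts, vector_size=32, window=5, min_count=1)
model.train(
    common_texts,
    total_examples=model.corpus_count,
    epochs=5,
    reduced_windows=False,  # train these epochs with a fixed-size window
)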
@@ -1046,7 +1050,8 @@
self.alpha = start_alpha or self.alpha
self.min_alpha = end_alpha or self.min_alpha
self.epochs = epochs
self.reduced_windows = reduced_windows
if reduced_windows is not None:
self.reduced_windows = bool(reduced_windows)

self._check_training_sanity(epochs=epochs, total_examples=total_examples, total_words=total_words)
self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=epochs)
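The `if reduced_windows is not None` guard above is a None-sentinel default; the self-contained sketch below (class and names are illustrative, not from the diff) shows why the default changed from True to None:

# With a None default, train() can tell "argument omitted" apart from "explicitly
# False", so an omitted argument never overwrites the construction-time choice.
class _Trainer:
    def __init__(self, reduced_windows=True):
        self.reduced_windows = bool(reduced_windows)

    def train(self, reduced_windows=None):
        if reduced_windows is not None:  # only override when explicitly passed
            self.reduced_windows = bool(reduced_windows)
        return self.reduced_windows

trainer = _Trainer(reduced_windows=False)
assert trainer.train() is False                     # omitted -> setting preserved
assert trainer.train(reduced_windows=True) is True  # explicit value sticks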
18 changes: 6 additions & 12 deletions gensim/models/word2vec_corpusfile.pyx
@@ -187,7 +187,7 @@ cdef void prepare_c_structures_for_batch(
int *effective_words, int *effective_sentences, unsigned long long *next_random,
cvocab_t *vocab, int *sentence_idx, np.uint32_t *indexes, int *codelens,
np.uint8_t **codes, np.uint32_t **points, np.uint32_t *reduced_windows,
bint do_reduced_windows) nogil:
int do_reduced_windows) nogil:
cdef VocabItem word
cdef string token
cdef vector[string] sent
@@ -229,7 +229,7 @@ cdef void prepare_c_structures_for_batch(
if do_reduced_windows:
reduced_windows[i] = random_int32(next_random) % window
else:
reduced_windows[i] = window
reduced_windows[i] = 0


cdef REAL_t get_alpha(REAL_t alpha, REAL_t end_alpha, int cur_epoch, int num_epochs) nogil:
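In this corpus-file path, `reduced_windows[i]` holds an offset that is later subtracted from `window` when scanning context positions, so a stored 0 now means the full window. A plain-Python sketch of the resulting effective half-width (the modulo draw mirrors `random_int32(next_random) % window`):

# Plain-Python illustration, not the Cython kernel: the training loops scan roughly
# [i - window + offset, i + window - offset], so the effective half-width is
# window - offset: 1..window with sampling on, exactly `window` with the offset at 0.
import random

def effective_half_width(window, do_reduced_windows, rng=random):
    offset = rng.randrange(window) if do_reduced_windows else 0  # like random % window
    return window - offset

random.seed(0)
print([effective_half_width(5, True) for _ in range(8)])   # varies within 1..5
print([effective_half_width(5, False) for _ in range(3)])  # always 5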
@@ -254,7 +254,7 @@ cdef REAL_t get_next_alpha(


def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, _work,
_neu1, compute_loss, reduced_windows):
_neu1, compute_loss):
"""Train Skipgram model for one epoch by training on an input stream. This function is used only in multistream mode.
Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.
@@ -273,9 +273,6 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec
Private working memory for each worker.
compute_loss : bool
Whether or not the training loss should be computed in this batch.
reduced_windows : bool
Whether or not the window size should be reduced based on random
uniform sampling.
Returns
-------
@@ -302,7 +299,7 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec
cdef long long total_sentences = 0
cdef long long total_effective_words = 0, total_words = 0
cdef int sent_idx, idx_start, idx_end
cdef bint do_reduced_windows = reduced_windows
cdef int do_reduced_windows = int(model.reduced_windows)

init_w2v_config(&c, model, _alpha, compute_loss, _work)

@@ -358,7 +355,7 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec


def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, _work,
_neu1, compute_loss, reduced_windows):
_neu1, compute_loss):
"""Train CBOW model for one epoch by training on an input stream. This function is used only in multistream mode.
Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.
@@ -377,9 +374,6 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp
Private working memory for each worker.
compute_loss : bool
Whether or not the training loss should be computed in this batch.
reduced_windows : bool
Whether or not the window size should be reduced based on random
uniform sampling.
Returns
-------
@@ -406,7 +400,7 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp
cdef long long total_sentences = 0
cdef long long total_effective_words = 0, total_words = 0
cdef int sent_idx, idx_start, idx_end
cdef bint do_reduced_windows = reduced_windows
cdef int do_reduced_windows = int(model.reduced_windows)

init_w2v_config(&c, model, _alpha, compute_loss, _work, _neu1)

28 changes: 10 additions & 18 deletions gensim/models/word2vec_inner.pyx
@@ -502,7 +502,7 @@ cdef init_w2v_config(Word2VecConfig *c, model, alpha, compute_loss, _work, _neu1
c[0].neu1 = <REAL_t *>np.PyArray_DATA(_neu1)


def train_batch_sg(model, sentences, alpha, _work, compute_loss, reduced_windows):
def train_batch_sg(model, sentences, alpha, _work, compute_loss):
"""Update skip-gram model by training on a batch of sentences.
Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.
@@ -519,9 +519,6 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss, reduced_windows
Private working memory for each worker.
compute_loss : bool
Whether or not the training loss should be computed in this batch.
reduced_windows : bool
Whether or not the window size should be reduced based on random
uniform sampling.
Returns
-------
@@ -573,12 +570,11 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss, reduced_windows
break # TODO: log warning, tally overflow?

# precompute "reduced window" offsets in a single randint() call
if reduced_windows:
window_size = model.random.randint(0, c.window, effective_words)
if model.reduced_windows:
for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
c.reduced_windows[i] = item
else:
window_size = [0] * effective_words
for i, item in enumerate(window_size):
c.reduced_windows[i] = item
c.reduced_windows[:] = 0

# release GIL & train on all sentences
with nogil:
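In the batch path the per-word offsets are now drawn with one vectorised randint() call (or left at zero) before the GIL is released; a NumPy sketch of that precompute with stand-in sizes:

# NumPy sketch of the precompute step above (sizes and names are stand-ins):
# a single randint() call fills every per-word offset, rather than one draw
# per word inside the nogil training loop.
import numpy as np

window, effective_words = 5, 12
rng = np.random.RandomState(1)  # stands in for model.random
shrink = True                   # stands in for model.reduced_windows

if shrink:
    offsets = rng.randint(0, window, effective_words)      # values in [0, window)
else:
    offsets = np.zeros(effective_words, dtype=np.uint32)   # full window everywhere

print(offsets)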
@@ -604,7 +600,7 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss, reduced_windows
return effective_words


def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss, reduced_windows):
def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss):
"""Update CBOW model by training on a batch of sentences.
Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.
@@ -623,9 +619,6 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss, reduce
Private working memory for each worker.
compute_loss : bool
Whether or not the training loss should be computed in this batch.
reduced_windows : bool
Whether or not the window size should be reduced based on random
uniform sampling.
Returns
-------
@@ -676,12 +669,11 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss, reduce
break # TODO: log warning, tally overflow?

# precompute "reduced window" offsets in a single randint() call
if reduced_windows:
window_size = model.random.randint(0, c.window, effective_words)
if model.reduced_windows:
for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
c.reduced_windows[i] = item
else:
window_size = [0] * effective_words
for i, item in enumerate(window_size):
c.reduced_windows[i] = item
c.reduced_windows[:] = 0

# release GIL & train on all sentences
with nogil:
