Improve the way reduced_windows is passed around and used.
pandrey-fr committed Jun 10, 2021
1 parent 6afb3ff commit 6a93037
Showing 3 changed files with 34 additions and 43 deletions.
31 changes: 18 additions & 13 deletions gensim/models/word2vec.py
@@ -223,13 +223,13 @@

def train_epoch_sg(
model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words,
_work, _neu1, compute_loss, reduced_windows,
_work, _neu1, compute_loss,
):
raise RuntimeError("Training with corpus_file argument is not supported")

def train_epoch_cbow(
model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words,
_work, _neu1, compute_loss, reduced_windows,
_work, _neu1, compute_loss,
):
raise RuntimeError("Training with corpus_file argument is not supported")

@@ -240,7 +240,7 @@ def __init__(
max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0,
trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
comment=None, max_final_vocab=None, reduced_windows=True
comment=None, max_final_vocab=None, reduced_windows=True,
):
"""Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/.
@@ -346,8 +346,10 @@ def __init__(
callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
Sequence of callbacks to be executed at specific stages during training.
reduced_windows : bool, optional
If True, the window size is uniformly sampled from {1, `window`}
during training. Otherwise, it is fixed to `window`.
If True, the effective window size is uniformly sampled from [1, `window`]
for each target word during training, to match the original word2vec algorithm's
approximate weighting of context words by distance. Otherwise, the effective
window size is always fixed to `window` words to either side.
Examples
--------
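The example section of this docstring is collapsed in the diff view above; as a stand-in, the sketch below shows how the constructor-level flag described here might be used. It assumes the keyword spelling `reduced_windows` from this branch (released gensim builds may expose the behaviour under another name) and uses the toy `common_texts` corpus shipped with gensim.

# Hedged usage sketch for the constructor flag (keyword name taken from this branch;
# it may differ in released gensim versions).
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

# Sampled windows (the default): each target word trains on a context whose
# half-width is drawn uniformly from [1, window], reproducing the original
# word2vec weighting of nearby context words.
model_sampled = Word2Vec(
    common_texts, vector_size=32, window=5, min_count=1, seed=1,
    reduced_windows=True,
)

# Fixed windows: every target word always uses the full `window` words per side.
model_fixed = Word2Vec(
    common_texts, vector_size=32, window=5, min_count=1, seed=1,
    reduced_windows=False,
)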
@@ -916,13 +918,13 @@ def _do_train_epoch(
examples, tally, raw_tally = train_epoch_sg(
self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1,
self.compute_loss, self.reduced_windows,
self.compute_loss,
)
else:
examples, tally, raw_tally = train_epoch_cbow(
self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1,
self.compute_loss, self.reduced_windows,
self.compute_loss,
)

return examples, tally, raw_tally
@@ -950,12 +952,12 @@ def _do_train_job(self, sentences, alpha, inits):
if self.sg:
tally += train_batch_sg(
self, sentences, alpha, work,
self.compute_loss, self.reduced_windows,
self.compute_loss,
)
else:
tally += train_batch_cbow(
self, sentences, alpha, work, neu1,
self.compute_loss, self.reduced_windows,
self.compute_loss,
)
return tally, self._raw_word_count(sentences)

@@ -967,7 +969,7 @@ def train(
self, corpus_iterable=None, corpus_file=None, total_examples=None,
total_words=None, epochs=None, start_alpha=None, end_alpha=None,
word_count=0, queue_factor=2, report_delay=1.0, compute_loss=False,
reduced_windows=True, callbacks=(), **kwargs,
reduced_windows=None, callbacks=(), **kwargs,
):
"""Update the model's neural weights from a sequence of sentences.
@@ -1025,8 +1027,10 @@ def train(
If True, computes and stores loss value which can be retrieved using
:meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`.
reduced_windows : bool, optional
If True, the window size is uniformly sampled from {1, `window`}
during training. Otherwise, it is fixed to `window`.
If True, the effective window size is uniformly sampled from [1, `window`]
for each target word during training, to match the original word2vec algorithm's
approximate weighting of context words by distance. Otherwise, the effective
window size is always fixed to `window` words to either side.
callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
Sequence of callbacks to be executed at specific stages during training.
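A hedged sketch of the matching train()-time override (corpus and counts are illustrative; the keyword spelling again follows this branch):

# Passing reduced_windows to train() updates model.reduced_windows before the
# epochs run; omitting it (None) keeps whatever the constructor set.
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

model = Word2Vec(common_texts, vector_size=32, window=5, min_count=1)
model.train(
    common_texts,
    total_examples=model.corpus_count,
    epochs=5,
    reduced_windows=False,  # train these epochs with a fixed-size window
)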
@@ -1046,7 +1050,8 @@
self.alpha = start_alpha or self.alpha
self.min_alpha = end_alpha or self.min_alpha
self.epochs = epochs
self.reduced_windows = reduced_windows
if reduced_windows is not None:
self.reduced_windows = bool(reduced_windows)

self._check_training_sanity(epochs=epochs, total_examples=total_examples, total_words=total_words)
self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=epochs)
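The `if reduced_windows is not None` guard above is a None-sentinel default; the self-contained sketch below (class and names are illustrative, not from the diff) shows why the default changed from True to None:

# With a None default, train() can tell "argument omitted" apart from "explicitly
# False", so an omitted argument never overwrites the construction-time choice.
class _Trainer:
    def __init__(self, reduced_windows=True):
        self.reduced_windows = bool(reduced_windows)

    def train(self, reduced_windows=None):
        if reduced_windows is not None:  # only override when explicitly passed
            self.reduced_windows = bool(reduced_windows)
        return self.reduced_windows

trainer = _Trainer(reduced_windows=False)
assert trainer.train() is False                     # omitted -> setting preserved
assert trainer.train(reduced_windows=True) is True  # explicit value sticks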
18 changes: 6 additions & 12 deletions gensim/models/word2vec_corpusfile.pyx
@@ -187,7 +187,7 @@ cdef void prepare_c_structures_for_batch(
int *effective_words, int *effective_sentences, unsigned long long *next_random,
cvocab_t *vocab, int *sentence_idx, np.uint32_t *indexes, int *codelens,
np.uint8_t **codes, np.uint32_t **points, np.uint32_t *reduced_windows,
bint do_reduced_windows) nogil:
int do_reduced_windows) nogil:
cdef VocabItem word
cdef string token
cdef vector[string] sent
@@ -229,7 +229,7 @@ cdef void prepare_c_structures_for_batch(
if do_reduced_windows:
reduced_windows[i] = random_int32(next_random) % window
else:
reduced_windows[i] = window
reduced_windows[i] = 0


cdef REAL_t get_alpha(REAL_t alpha, REAL_t end_alpha, int cur_epoch, int num_epochs) nogil:
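In this corpus-file path, `reduced_windows[i]` holds an offset that is later subtracted from `window` when scanning context positions, so a stored 0 now means the full window. A plain-Python sketch of the resulting effective half-width (the modulo draw mirrors `random_int32(next_random) % window`):

# Plain-Python illustration, not the Cython kernel: the training loops scan roughly
# [i - window + offset, i + window - offset], so the effective half-width is
# window - offset: 1..window with sampling on, exactly `window` with the offset at 0.
import random

def effective_half_width(window, do_reduced_windows, rng=random):
    offset = rng.randrange(window) if do_reduced_windows else 0  # like random % window
    return window - offset

random.seed(0)
print([effective_half_width(5, True) for _ in range(8)])   # varies within 1..5
print([effective_half_width(5, False) for _ in range(3)])  # always 5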
@@ -254,7 +254,7 @@ cdef REAL_t get_next_alpha(


def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, _work,
_neu1, compute_loss, reduced_windows):
_neu1, compute_loss):
"""Train Skipgram model for one epoch by training on an input stream. This function is used only in multistream mode.
Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.
@@ -273,9 +273,6 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec
Private working memory for each worker.
compute_loss : bool
Whether or not the training loss should be computed in this batch.
reduced_windows : bool
Whether or not the window size should be reduced based on random
uniform sampling.
Returns
-------
@@ -302,7 +299,7 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec
cdef long long total_sentences = 0
cdef long long total_effective_words = 0, total_words = 0
cdef int sent_idx, idx_start, idx_end
cdef bint do_reduced_windows = reduced_windows
cdef int do_reduced_windows = int(model.reduced_windows)

init_w2v_config(&c, model, _alpha, compute_loss, _work)

@@ -358,7 +355,7 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec


def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, _work,
_neu1, compute_loss, reduced_windows):
_neu1, compute_loss):
"""Train CBOW model for one epoch by training on an input stream. This function is used only in multistream mode.
Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.
@@ -377,9 +374,6 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp
Private working memory for each worker.
compute_loss : bool
Whether or not the training loss should be computed in this batch.
reduced_windows : bool
Whether or not the window size should be reduced based on random
uniform sampling.
Returns
-------
@@ -406,7 +400,7 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp
cdef long long total_sentences = 0
cdef long long total_effective_words = 0, total_words = 0
cdef int sent_idx, idx_start, idx_end
cdef bint do_reduced_windows = reduced_windows
cdef int do_reduced_windows = int(model.reduced_windows)

init_w2v_config(&c, model, _alpha, compute_loss, _work, _neu1)

28 changes: 10 additions & 18 deletions gensim/models/word2vec_inner.pyx
@@ -502,7 +502,7 @@ cdef init_w2v_config(Word2VecConfig *c, model, alpha, compute_loss, _work, _neu1
c[0].neu1 = <REAL_t *>np.PyArray_DATA(_neu1)


def train_batch_sg(model, sentences, alpha, _work, compute_loss, reduced_windows):
def train_batch_sg(model, sentences, alpha, _work, compute_loss):
"""Update skip-gram model by training on a batch of sentences.
Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.
@@ -519,9 +519,6 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss, reduced_windows
Private working memory for each worker.
compute_loss : bool
Whether or not the training loss should be computed in this batch.
reduced_windows : bool
Whether or not the window size should be reduced based on random
uniform sampling.
Returns
-------
@@ -573,12 +570,11 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss, reduced_windows
break # TODO: log warning, tally overflow?

# precompute "reduced window" offsets in a single randint() call
if reduced_windows:
window_size = model.random.randint(0, c.window, effective_words)
if model.reduced_windows:
for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
c.reduced_windows[i] = item
else:
window_size = [0] * effective_words
for i, item in enumerate(window_size):
c.reduced_windows[i] = item
c.reduced_windows[:] = 0

# release GIL & train on all sentences
with nogil:
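In the batch path the per-word offsets are now drawn with one vectorised randint() call (or left at zero) before the GIL is released; a NumPy sketch of that precompute with stand-in sizes:

# NumPy sketch of the precompute step above (sizes and names are stand-ins):
# a single randint() call fills every per-word offset, rather than one draw
# per word inside the nogil training loop.
import numpy as np

window, effective_words = 5, 12
rng = np.random.RandomState(1)  # stands in for model.random
shrink = True                   # stands in for model.reduced_windows

if shrink:
    offsets = rng.randint(0, window, effective_words)      # values in [0, window)
else:
    offsets = np.zeros(effective_words, dtype=np.uint32)   # full window everywhere

print(offsets)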
@@ -604,7 +600,7 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss, reduced_windows
return effective_words


def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss, reduced_windows):
def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss):
"""Update CBOW model by training on a batch of sentences.
Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.
@@ -623,9 +619,6 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss, reduce
Private working memory for each worker.
compute_loss : bool
Whether or not the training loss should be computed in this batch.
reduced_windows : bool
Whether or not the window size should be reduced based on random
uniform sampling.
Returns
-------
@@ -676,12 +669,11 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss, reduce
break # TODO: log warning, tally overflow?

# precompute "reduced window" offsets in a single randint() call
if reduced_windows:
window_size = model.random.randint(0, c.window, effective_words)
if model.reduced_windows:
for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
c.reduced_windows[i] = item
else:
window_size = [0] * effective_words
for i, item in enumerate(window_size):
c.reduced_windows[i] = item
c.reduced_windows[:] = 0

# release GIL & train on all sentences
with nogil:
