piskvorky · mpenkov · Jun 29, 2021 · Jun 7, 2021 · Jun 10, 2021 · Jun 10, 2021
diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
@@ -158,7 +158,7 @@ def count(self, new_val):
 class Doc2Vec(Word2Vec):
     def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=None, dm=1, dbow_words=0, dm_concat=0,
                  dm_tag_count=1, dv=None, dv_mapfile=None, comment=None, trim_rule=None, callbacks=(),
-                 window=5, epochs=10, **kwargs):
+                 window=5, epochs=10, shrink_windows=True, **kwargs):
         """Class for training, using and evaluating neural networks described in
         `Distributed Representations of Sentences and Documents <http://arxiv.org/abs/1405.4053v2>`_.
 
@@ -248,6 +248,11 @@ def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=No
 
         callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`, optional
             List of callbacks that need to be executed/run at specific stages during training.
+        shrink_windows : bool, optional
+            If True, the effective window size is uniformly sampled from [1, `window`]
+            for each target word during training, to match the original word2vec algorithm's
+            approximate weighting of context words by distance. Otherwise, the effective
+            window size is always fixed to `window` words to either side.
 
         Some important internal attributes are the following:
 
@@ -294,6 +299,7 @@ def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=No
             callbacks=callbacks,
             window=window,
             epochs=epochs,
+            shrink_windows=shrink_windows,
             **kwargs,
         )
 

diff --git a/gensim/models/doc2vec_corpusfile.pyx b/gensim/models/doc2vec_corpusfile.pyx
@@ -59,7 +59,7 @@ cdef void prepare_c_structures_for_batch(
         int *effective_words, unsigned long long *next_random, cvocab_t *vocab,
         np.uint32_t *indexes, int *codelens, np.uint8_t **codes, np.uint32_t **points,
         np.uint32_t *reduced_windows, int *document_len, int train_words,
-        int docvecs_count, int doc_tag,
+        int docvecs_count, int doc_tag, int shrink_windows,
     ) nogil:
     cdef VocabItem predict_word
     cdef string token
@@ -87,8 +87,12 @@ cdef void prepare_c_structures_for_batch(
     document_len[0] = i
 
     if train_words and reduced_windows != NULL:
-        for i in range(document_len[0]):
-            reduced_windows[i] = random_int32(next_random) % window
+        if shrink_windows:
+            for i in range(document_len[0]):
+                reduced_windows[i] = random_int32(next_random) % window
+        else:
+            for i in range(document_len[0]):
+                reduced_windows[i] = 0
 
     if doc_tag < docvecs_count:
         effective_words[0] += 1
@@ -160,6 +164,7 @@ def d2v_train_epoch_dbow(
     cdef long long total_documents = 0
     cdef long long total_effective_words = 0, total_words = 0
     cdef int sent_idx, idx_start, idx_end
+    cdef int shrink_windows = int(model.shrink_windows)
 
     cdef vector[string] doc_words
     cdef long long _doc_tag = start_doctag
@@ -183,7 +188,7 @@ def d2v_train_epoch_dbow(
             prepare_c_structures_for_batch(
                 doc_words, c.sample, c.hs, c.window, &total_words, &effective_words,
                 &c.next_random, vocab.get_vocab_ptr(), c.indexes, c.codelens,  c.codes, c.points,
-                c.reduced_windows, &document_len, c.train_words, c.docvecs_count, _doc_tag)
+                c.reduced_windows, &document_len, c.train_words, c.docvecs_count, _doc_tag, shrink_windows)
 
             for i in range(document_len):
                 if c.train_words:  # simultaneous skip-gram wordvec-training
@@ -300,6 +305,7 @@ def d2v_train_epoch_dm(
     cdef long long total_effective_words = 0, total_words = 0
     cdef int sent_idx, idx_start, idx_end
     cdef REAL_t count, inv_count = 1.0
+    cdef int shrink_windows = int(model.shrink_windows)
 
     cdef vector[string] doc_words
     cdef long long _doc_tag = start_doctag
@@ -323,7 +329,7 @@ def d2v_train_epoch_dm(
             prepare_c_structures_for_batch(
                 doc_words, c.sample, c.hs, c.window, &total_words, &effective_words, &c.next_random,
                 vocab.get_vocab_ptr(), c.indexes, c.codelens, c.codes, c.points, c.reduced_windows,
-                &document_len, c.train_words, c.docvecs_count, _doc_tag)
+                &document_len, c.train_words, c.docvecs_count, _doc_tag, shrink_windows)
 
             for i in range(document_len):
                 j = i - c.window + c.reduced_windows[i]
@@ -453,6 +459,7 @@ def d2v_train_epoch_dm_concat(
     cdef long long total_documents = 0
     cdef long long total_effective_words = 0, total_words = 0
     cdef int sent_idx, idx_start, idx_end
+    cdef int shrink_windows = int(model.shrink_windows)
 
     cdef vector[string] doc_words
     cdef long long _doc_tag = start_doctag
@@ -490,7 +497,8 @@ def d2v_train_epoch_dm_concat(
             prepare_c_structures_for_batch(
                 doc_words, c.sample, c.hs, c.window, &total_words, &effective_words,
                 &c.next_random, vocab.get_vocab_ptr(), c.indexes, c.codelens, c.codes,
-                c.points, NULL, &document_len, c.train_words, c.docvecs_count, _doc_tag)
+                c.points, NULL, &document_len, c.train_words, c.docvecs_count, _doc_tag,
+                shrink_windows)
 
             for i in range(document_len):
                 j = i - c.window      # negative OK: will pad with null word

diff --git a/gensim/models/doc2vec_inner.pyx b/gensim/models/doc2vec_inner.pyx
@@ -365,8 +365,12 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None,
 
     if c.train_words:
         # single randint() call avoids a big thread-synchronization slowdown
-        for i, item in enumerate(model.random.randint(0, c.window, c.document_len)):
-            c.reduced_windows[i] = item
+        if model.shrink_windows:
+            for i, item in enumerate(model.random.randint(0, c.window, c.document_len)):
+                c.reduced_windows[i] = item
+        else:
+            for i in range(c.document_len):
+                c.reduced_windows[i] = 0
 
     for i in range(c.doctag_len):
         c.doctag_indexes[i] = doctag_indexes[i]
@@ -497,8 +501,12 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N
     c.document_len = i
 
     # single randint() call avoids a big thread-sync slowdown
-    for i, item in enumerate(model.random.randint(0, c.window, c.document_len)):
-        c.reduced_windows[i] = item
+    if model.shrink_windows:
+        for i, item in enumerate(model.random.randint(0, c.window, c.document_len)):
+            c.reduced_windows[i] = item
+    else:
+        for i in range(c.document_len):
+            c.reduced_windows[i] = 0
 
     for i in range(c.doctag_len):
         c.doctag_indexes[i] = doctag_indexes[i]

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
@@ -276,7 +276,7 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
                  max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
                  negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, min_n=3, max_n=6,
                  sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(),
-                 max_final_vocab=None):
+                 max_final_vocab=None, shrink_windows=True,):
         """Train, use and evaluate word representations learned using the method
         described in `Enriching Word Vectors with Subword Information <https://arxiv.org/abs/1607.04606>`_,
         aka FastText.
@@ -385,6 +385,11 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
             ``min_count```.  If the specified ``min_count`` is more than the
             automatically calculated ``min_count``, the former will be used.
             Set to ``None`` if not required.
+        shrink_windows : bool, optional
+            If True, the effective window size is uniformly sampled from [1, `window`]
+            for each target word during training, to match the original word2vec algorithm's
+            approximate weighting of context words by distance. Otherwise, the effective
+            window size is always fixed to `window` words to either side.
 
         Examples
         --------
@@ -432,7 +437,8 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
             max_vocab_size=max_vocab_size, max_final_vocab=max_final_vocab,
             min_count=min_count, sample=sample, sorted_vocab=sorted_vocab,
             null_word=null_word, ns_exponent=ns_exponent, hashfxn=hashfxn,
-            seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha)
+            seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean,
+            min_alpha=min_alpha, shrink_windows=shrink_windows)
 
     def _init_post_load(self, hidden_output):
         num_vectors = len(self.wv.vectors)

diff --git a/gensim/models/fasttext_corpusfile.pyx b/gensim/models/fasttext_corpusfile.pyx
@@ -46,7 +46,8 @@ cdef void prepare_c_structures_for_batch(
         vector[vector[string]] &sentences, int sample, int hs, int window, long long *total_words,
         int *effective_words, int *effective_sentences, unsigned long long *next_random, cvocab_t *vocab,
         int *sentence_idx, np.uint32_t *indexes, int *codelens, np.uint8_t **codes, np.uint32_t **points,
-        np.uint32_t *reduced_windows, int *subwords_idx_len, np.uint32_t **subwords_idx) nogil:
+        np.uint32_t *reduced_windows, int *subwords_idx_len, np.uint32_t **subwords_idx, int shrink_windows,
+    ) nogil:
     cdef VocabItem word
     cdef string token
     cdef vector[string] sent
@@ -88,8 +89,12 @@ cdef void prepare_c_structures_for_batch(
             break
 
     # precompute "reduced window" offsets in a single randint() call
-    for i in range(effective_words[0]):
-        reduced_windows[i] = random_int32(next_random) % window
+    if shrink_windows:
+        for i in range(effective_words[0]):
+            reduced_windows[i] = random_int32(next_random) % window
+    else:
+        for i in range(effective_words[0]):
+            reduced_windows[i] = 0
 
 
 def train_epoch_sg(
@@ -136,6 +141,7 @@ def train_epoch_sg(
     cdef long long total_sentences = 0
     cdef long long total_effective_words = 0, total_words = 0
     cdef int sent_idx, idx_start, idx_end
+    cdef int shrink_windows = int(model.shrink_windows)
 
     init_ft_config(&c, model, _alpha, _work, _l1)
 
@@ -153,7 +159,7 @@ def train_epoch_sg(
             prepare_c_structures_for_batch(
                 sentences, c.sample, c.hs, c.window, &total_words, &effective_words, &effective_sentences,
                 &c.next_random, vocab.get_vocab_ptr(), c.sentence_idx, c.indexes, c.codelens,
-                c.codes, c.points, c.reduced_windows, c.subwords_idx_len, c.subwords_idx)
+                c.codes, c.points, c.reduced_windows, c.subwords_idx_len, c.subwords_idx, shrink_windows)
 
             for sent_idx in range(effective_sentences):
                 idx_start = c.sentence_idx[sent_idx]
@@ -226,6 +232,7 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp
     cdef long long total_sentences = 0
     cdef long long total_effective_words = 0, total_words = 0
     cdef int sent_idx, idx_start, idx_end
+    cdef int shrink_windows = int(model.shrink_windows)
 
     init_ft_config(&c, model, _alpha, _work, _neu1)
 
@@ -243,7 +250,7 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp
             prepare_c_structures_for_batch(
                 sentences, c.sample, c.hs, c.window, &total_words, &effective_words, &effective_sentences,
                 &c.next_random, vocab.get_vocab_ptr(), c.sentence_idx, c.indexes, c.codelens,
-                c.codes, c.points, c.reduced_windows, c.subwords_idx_len, c.subwords_idx)
+                c.codes, c.points, c.reduced_windows, c.subwords_idx_len, c.subwords_idx, shrink_windows)
 
             for sent_idx in range(effective_sentences):
                 idx_start = c.sentence_idx[sent_idx]

diff --git a/gensim/models/fasttext_inner.pyx b/gensim/models/fasttext_inner.pyx
@@ -601,8 +601,12 @@ def train_batch_any(model, sentences, alpha, _work, _neu1):
     num_words, num_sentences = populate_ft_config(&c, model.wv, model.wv.buckets_word, sentences)
 
     # precompute "reduced window" offsets in a single randint() call
-    for i, randint in enumerate(model.random.randint(0, c.window, num_words)):
-        c.reduced_windows[i] = randint
+    if model.shrink_windows:
+        for i, randint in enumerate(model.random.randint(0, c.window, num_words)):
+            c.reduced_windows[i] = randint
+    else:
+        for i in range(num_words):
+            c.reduced_windows[i] = 0
 
     # release GIL & train on all sentences in the batch
     with nogil:

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
@@ -240,7 +240,7 @@ def __init__(
             max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
             sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0,
             trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
-            comment=None, max_final_vocab=None,
+            comment=None, max_final_vocab=None, shrink_windows=True,
         ):
         """Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/.
 
@@ -345,6 +345,11 @@ def __init__(
             :meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`.
         callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
             Sequence of callbacks to be executed at specific stages during training.
+        shrink_windows : bool, optional
+            If True, the effective window size is uniformly sampled from [1, `window`]
+            for each target word during training, to match the original word2vec algorithm's
+            approximate weighting of context words by distance. Otherwise, the effective
+            window size is always fixed to `window` words to either side.
 
         Examples
         --------
@@ -377,6 +382,7 @@ def __init__(
         self.min_alpha = float(min_alpha)
 
         self.window = int(window)
+        self.shrink_windows = bool(shrink_windows)
         self.random = np.random.RandomState(seed)
 
         self.hs = int(hs)
@@ -910,12 +916,14 @@ def _do_train_epoch(
         if self.sg:
             examples, tally, raw_tally = train_epoch_sg(
                 self, corpus_file, offset, cython_vocab, cur_epoch,
-                total_examples, total_words, work, neu1, self.compute_loss,
+                total_examples, total_words, work, neu1,
+                self.compute_loss,
             )
         else:
             examples, tally, raw_tally = train_epoch_cbow(
                 self, corpus_file, offset, cython_vocab, cur_epoch,
-                total_examples, total_words, work, neu1, self.compute_loss,
+                total_examples, total_words, work, neu1,
+                self.compute_loss,
             )
 
         return examples, tally, raw_tally
@@ -941,20 +949,26 @@ def _do_train_job(self, sentences, alpha, inits):
         work, neu1 = inits
         tally = 0
         if self.sg:
-            tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss)
+            tally += train_batch_sg(
+                self, sentences, alpha, work,
+                self.compute_loss,
+            )
         else:
-            tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss)
+            tally += train_batch_cbow(
+                self, sentences, alpha, work, neu1,
+                self.compute_loss,
+            )
         return tally, self._raw_word_count(sentences)
 
     def _clear_post_train(self):
         """Clear any cached values that training may have invalidated."""
         self.wv.norms = None
 
     def train(
-            self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None,
-            epochs=None, start_alpha=None, end_alpha=None, word_count=0,
-            queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(),
-            **kwargs,
+            self, corpus_iterable=None, corpus_file=None, total_examples=None,
+            total_words=None, epochs=None, start_alpha=None, end_alpha=None,
+            word_count=0, queue_factor=2, report_delay=1.0, compute_loss=False,
+            callbacks=(), **kwargs,
         ):
         """Update the model's neural weights from a sequence of sentences.
 
@@ -1039,7 +1053,7 @@ def train(
             msg=(
                 f"training model with {self.workers} workers on {len(self.wv)} vocabulary and "
                 f"{self.layer1_size} features, using sg={self.sg} hs={self.hs} sample={self.sample} "
-                f"negative={self.negative} window={self.window}"
+                f"negative={self.negative} window={self.window} shrink_windows={self.shrink_windows}"
             ),
         )
 
@@ -1970,6 +1984,8 @@ def _load_specials(self, *args, **kwargs):
                 self.syn1 = self.syn1
                 del self.syn1
             del self.trainables
+        if not hasattr(self, 'shrink_windows'):
+            self.shrink_windows = True
 
     def get_latest_training_loss(self):
         """Get current value of the training loss.