diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 53a02d80fe..aa245c369a 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -240,7 +240,7 @@ def __init__(
             max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
             sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0,
             trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
-            comment=None, max_final_vocab=None, reduced_windows=True,
+            comment=None, max_final_vocab=None, shrink_windows=True,
     ):
         """Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/.

@@ -345,7 +345,7 @@ def __init__(
             :meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`.
         callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
             Sequence of callbacks to be executed at specific stages during training.
-        reduced_windows : bool, optional
+        shrink_windows : bool, optional
             If True, the effective window size is uniformly sampled from [1, `window`]
             for each target word during training, to match the original word2vec algorithm's
             approximate weighting of context words by distance. Otherwise, the effective
@@ -382,7 +382,7 @@ def __init__(
         self.min_alpha = float(min_alpha)

         self.window = int(window)
-        self.reduced_windows = bool(reduced_windows)
+        self.shrink_windows = bool(shrink_windows)
         self.random = np.random.RandomState(seed)

         self.hs = int(hs)
@@ -426,7 +426,7 @@ def __init__(
                 corpus_iterable=corpus_iterable, corpus_file=corpus_file, total_examples=self.corpus_count,
                 total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha,
                 end_alpha=self.min_alpha, compute_loss=self.compute_loss, callbacks=callbacks,
-                reduced_windows=self.reduced_windows)
+                shrink_windows=self.shrink_windows)
         else:
             if trim_rule is not None:
                 logger.warning(
@@ -969,7 +969,7 @@ def train(
             self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None,
             epochs=None, start_alpha=None, end_alpha=None, word_count=0,
             queue_factor=2, report_delay=1.0, compute_loss=False,
-            reduced_windows=None, callbacks=(), **kwargs,
+            shrink_windows=None, callbacks=(), **kwargs,
     ):
         """Update the model's neural weights from a sequence of sentences.

@@ -1026,7 +1026,7 @@ def train(
         compute_loss: bool, optional
             If True, computes and stores loss value which can be retrieved using
             :meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`.
-        reduced_windows : bool, optional
+        shrink_windows : bool, optional
             If True, the effective window size is uniformly sampled from [1, `window`]
             for each target word during training, to match the original word2vec algorithm's
             approximate weighting of context words by distance. Otherwise, the effective
@@ -1050,8 +1050,8 @@ def train(
         self.alpha = start_alpha or self.alpha
         self.min_alpha = end_alpha or self.min_alpha
         self.epochs = epochs
-        if reduced_windows is not None:
-            self.reduced_windows = bool(reduced_windows)
+        if shrink_windows is not None:
+            self.shrink_windows = bool(shrink_windows)

         self._check_training_sanity(epochs=epochs, total_examples=total_examples, total_words=total_words)
         self._check_corpus_sanity(corpus_iterable=corpus_iterable, corpus_file=corpus_file, passes=epochs)
@@ -1061,7 +1061,7 @@ def train(
             msg=(
                 f"training model with {self.workers} workers on {len(self.wv)} vocabulary and "
                 f"{self.layer1_size} features, using sg={self.sg} hs={self.hs} sample={self.sample} "
-                f"negative={self.negative} window={self.window} reduced_windows={self.reduced_windows}"
+                f"negative={self.negative} window={self.window} shrink_windows={self.shrink_windows}"
             ),
         )

diff --git a/gensim/models/word2vec_corpusfile.pyx b/gensim/models/word2vec_corpusfile.pyx
index 2af398e58b..da94e78ec1 100644
--- a/gensim/models/word2vec_corpusfile.pyx
+++ b/gensim/models/word2vec_corpusfile.pyx
@@ -187,7 +187,7 @@ cdef void prepare_c_structures_for_batch(
         int *effective_words, int *effective_sentences, unsigned long long *next_random, cvocab_t *vocab,
         int *sentence_idx, np.uint32_t *indexes, int *codelens, np.uint8_t **codes, np.uint32_t **points,
         np.uint32_t *reduced_windows,
-        int do_reduced_windows) nogil:
+        int shrink_windows) nogil:
     cdef VocabItem word
     cdef string token
     cdef vector[string] sent
@@ -226,7 +226,7 @@

     # precompute "reduced window" offsets in a single randint() call
     for i in range(effective_words[0]):
-        if do_reduced_windows:
+        if shrink_windows:
             reduced_windows[i] = random_int32(next_random) % window
         else:
             reduced_windows[i] = 0
@@ -299,7 +299,7 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec
     cdef long long total_sentences = 0
     cdef long long total_effective_words = 0, total_words = 0
     cdef int sent_idx, idx_start, idx_end
-    cdef int do_reduced_windows = int(model.reduced_windows)
+    cdef int shrink_windows = int(model.shrink_windows)

     init_w2v_config(&c, model, _alpha, compute_loss, _work)

@@ -316,7 +316,7 @@ def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expec
             prepare_c_structures_for_batch(
                 sentences, c.sample, c.hs, c.window, &total_words, &effective_words,
                 &effective_sentences, &c.next_random, vocab.get_vocab_ptr(), c.sentence_idx, c.indexes,
-                c.codelens, c.codes, c.points, c.reduced_windows, do_reduced_windows)
+                c.codelens, c.codes, c.points, c.reduced_windows, shrink_windows)

             for sent_idx in range(effective_sentences):
                 idx_start = c.sentence_idx[sent_idx]
@@ -400,7 +400,7 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp
     cdef long long total_sentences = 0
     cdef long long total_effective_words = 0, total_words = 0
     cdef int sent_idx, idx_start, idx_end
-    cdef int do_reduced_windows = int(model.reduced_windows)
+    cdef int shrink_windows = int(model.shrink_windows)

     init_w2v_config(&c, model, _alpha, compute_loss, _work, _neu1)

@@ -417,7 +417,7 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp
             prepare_c_structures_for_batch(
                 sentences, c.sample, c.hs, c.window, &total_words, &effective_words,
                 &effective_sentences, &c.next_random, vocab.get_vocab_ptr(), c.sentence_idx,
-                c.indexes, c.codelens, c.codes, c.points, c.reduced_windows, do_reduced_windows)
+                c.indexes, c.codelens, c.codes, c.points, c.reduced_windows, shrink_windows)

             for sent_idx in range(effective_sentences):
                 idx_start = c.sentence_idx[sent_idx]
diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx
index 9662931479..7510e4dee2 100755
--- a/gensim/models/word2vec_inner.pyx
+++ b/gensim/models/word2vec_inner.pyx
@@ -570,7 +570,7 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss):
                 break  # TODO: log warning, tally overflow?

     # precompute "reduced window" offsets in a single randint() call
-    if model.reduced_windows:
+    if model.shrink_windows:
         for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
             c.reduced_windows[i] = item
     else:
@@ -669,7 +669,7 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss):
                 break  # TODO: log warning, tally overflow?

     # precompute "reduced window" offsets in a single randint() call
-    if model.reduced_windows:
+    if model.shrink_windows:
         for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
             c.reduced_windows[i] = item
     else:
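
For context, a minimal usage sketch of the renamed flag from the public API (not part of the patch itself; it assumes gensim built with this change, and the toy corpus below is purely illustrative):

    # Sketch: exercising shrink_windows via Word2Vec() and train(), per the new signatures above.
    from gensim.models import Word2Vec

    # Tiny stand-in corpus; any iterable of token lists works.
    sentences = [
        ["human", "interface", "computer"],
        ["survey", "user", "computer", "system", "response", "time"],
        ["graph", "minors", "survey"],
    ]

    # shrink_windows=True (the default) samples an effective window uniformly from
    # [1, window] for each target word, matching the original word2vec C tool.
    model = Word2Vec(sentences, vector_size=50, window=5, min_count=1, shrink_windows=True)

    # shrink_windows=False keeps the full `window` words on each side of the target.
    model_fixed = Word2Vec(sentences, vector_size=50, window=5, min_count=1, shrink_windows=False)

    # The flag can also be toggled on a later train() call, which updates self.shrink_windows.
    model.train(sentences, total_examples=len(sentences), epochs=2, shrink_windows=False)

Note that train() only overrides the stored setting when shrink_windows is not None, so omitting the argument leaves whatever was chosen at construction time in effect.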