Implement shrink_windows argument for Word2Vec. #3169

Merged (14 commits, Jun 29, 2021)
Changes from 11 commits
8 changes: 7 additions & 1 deletion gensim/models/doc2vec.py
@@ -158,7 +158,7 @@ def count(self, new_val):
class Doc2Vec(Word2Vec):
def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=None, dm=1, dbow_words=0, dm_concat=0,
dm_tag_count=1, dv=None, dv_mapfile=None, comment=None, trim_rule=None, callbacks=(),
window=5, epochs=10, **kwargs):
window=5, epochs=10, shrink_windows=True, **kwargs):
"""Class for training, using and evaluating neural networks described in
`Distributed Representations of Sentences and Documents <http://arxiv.org/abs/1405.4053v2>`_.

@@ -248,6 +248,11 @@ def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=No

callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`, optional
List of callbacks that need to be executed/run at specific stages during training.
shrink_windows : bool, optional
If True, the effective window size is uniformly sampled from [1, `window`]
for each target word during training, to match the original word2vec algorithm's
approximate weighting of context words by distance. Otherwise, the effective
window size is always fixed to `window` words to either side.

Some important internal attributes are the following:

@@ -294,6 +299,7 @@ def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=No
callbacks=callbacks,
window=window,
epochs=epochs,
shrink_windows=shrink_windows,
**kwargs,
)

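For context, a minimal usage sketch of the new argument with Doc2Vec follows; the toy corpus and hyperparameter values are illustrative, not taken from this PR:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

corpus = [
    TaggedDocument(words=["machine", "learning", "is", "fun"], tags=[0]),
    TaggedDocument(words=["gensim", "trains", "document", "vectors"], tags=[1]),
]

# Default shrink_windows=True keeps the historical behaviour: the effective
# window is sampled uniformly from [1, window] for each target word.
model_default = Doc2Vec(corpus, vector_size=50, window=5, min_count=1, epochs=10)

# shrink_windows=False always uses the full `window` words to either side.
model_fixed = Doc2Vec(corpus, vector_size=50, window=5, min_count=1, epochs=10,
                      shrink_windows=False)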
20 changes: 14 additions & 6 deletions gensim/models/doc2vec_corpusfile.pyx
@@ -59,7 +59,7 @@ cdef void prepare_c_structures_for_batch(
int *effective_words, unsigned long long *next_random, cvocab_t *vocab,
np.uint32_t *indexes, int *codelens, np.uint8_t **codes, np.uint32_t **points,
np.uint32_t *reduced_windows, int *document_len, int train_words,
int docvecs_count, int doc_tag,
int docvecs_count, int doc_tag, int shrink_windows,
) nogil:
cdef VocabItem predict_word
cdef string token
@@ -87,8 +87,12 @@ cdef void prepare_c_structures_for_batch(
document_len[0] = i

if train_words and reduced_windows != NULL:
for i in range(document_len[0]):
reduced_windows[i] = random_int32(next_random) % window
if shrink_windows:
for i in range(document_len[0]):
reduced_windows[i] = random_int32(next_random) % window
else:
for i in range(document_len[0]):
reduced_windows[i] = 0

if doc_tag < docvecs_count:
effective_words[0] += 1
@@ -160,6 +164,7 @@ def d2v_train_epoch_dbow(
cdef long long total_documents = 0
cdef long long total_effective_words = 0, total_words = 0
cdef int sent_idx, idx_start, idx_end
cdef int shrink_windows = int(model.shrink_windows)

cdef vector[string] doc_words
cdef long long _doc_tag = start_doctag
@@ -183,7 +188,7 @@
prepare_c_structures_for_batch(
doc_words, c.sample, c.hs, c.window, &total_words, &effective_words,
&c.next_random, vocab.get_vocab_ptr(), c.indexes, c.codelens, c.codes, c.points,
c.reduced_windows, &document_len, c.train_words, c.docvecs_count, _doc_tag)
c.reduced_windows, &document_len, c.train_words, c.docvecs_count, _doc_tag, shrink_windows)

for i in range(document_len):
if c.train_words: # simultaneous skip-gram wordvec-training
@@ -300,6 +305,7 @@ def d2v_train_epoch_dm(
cdef long long total_effective_words = 0, total_words = 0
cdef int sent_idx, idx_start, idx_end
cdef REAL_t count, inv_count = 1.0
cdef int shrink_windows = int(model.shrink_windows)

cdef vector[string] doc_words
cdef long long _doc_tag = start_doctag
@@ -323,7 +329,7 @@
prepare_c_structures_for_batch(
doc_words, c.sample, c.hs, c.window, &total_words, &effective_words, &c.next_random,
vocab.get_vocab_ptr(), c.indexes, c.codelens, c.codes, c.points, c.reduced_windows,
&document_len, c.train_words, c.docvecs_count, _doc_tag)
&document_len, c.train_words, c.docvecs_count, _doc_tag, shrink_windows)

for i in range(document_len):
j = i - c.window + c.reduced_windows[i]
@@ -453,6 +459,7 @@ def d2v_train_epoch_dm_concat(
cdef long long total_documents = 0
cdef long long total_effective_words = 0, total_words = 0
cdef int sent_idx, idx_start, idx_end
cdef int shrink_windows = int(model.shrink_windows)

cdef vector[string] doc_words
cdef long long _doc_tag = start_doctag
@@ -490,7 +497,8 @@ def d2v_train_epoch_dm_concat(
prepare_c_structures_for_batch(
doc_words, c.sample, c.hs, c.window, &total_words, &effective_words,
&c.next_random, vocab.get_vocab_ptr(), c.indexes, c.codelens, c.codes,
c.points, NULL, &document_len, c.train_words, c.docvecs_count, _doc_tag)
c.points, NULL, &document_len, c.train_words, c.docvecs_count, _doc_tag,
shrink_windows)

for i in range(document_len):
j = i - c.window # negative OK: will pad with null word
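For illustration, the reduced-window logic that the Cython helper above fills into C arrays can be sketched in plain Python as follows; the function name and signature are mine, not part of the PR:

import numpy as np

def reduced_window_offsets(document_len, window, shrink_windows, rng=np.random):
    # Offset subtracted from `window` for each target word; 0 means the full window is used.
    if shrink_windows:
        # Offsets in [0, window) give effective window sizes uniform over [1, window].
        return rng.randint(0, window, document_len)
    return np.zeros(document_len, dtype=np.uint32)

# The training loops then use roughly:
#   j = i - window + reduced_windows[i]       # first context position (clamped to 0)
#   k = i + window + 1 - reduced_windows[i]   # one past the last context position (clamped to document_len)
# which matches `j = i - c.window + c.reduced_windows[i]` in d2v_train_epoch_dm above.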
16 changes: 12 additions & 4 deletions gensim/models/doc2vec_inner.pyx
@@ -365,8 +365,12 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None,

if c.train_words:
# single randint() call avoids a big thread-synchronization slowdown
for i, item in enumerate(model.random.randint(0, c.window, c.document_len)):
c.reduced_windows[i] = item
if model.shrink_windows:
Collaborator:
This is a bit repetitive.

Wouldn't it be better to define a model.initialize_reduced_windows method that does this, and call it when needed?

def initialize_reduced_windows(self, ...):
  if model.shrink_windows:
    for i, item in ...:
        ...
  else:
    ...

That way, if we find a logic error in the initialization code, we don't have to remember to fix it in a half a dozen other places (e.g. https://github.com/RaRe-Technologies/gensim/pull/3169/files#r649243075).

We should probably also do a bounds check on c.reduced_windows to ensure that it's at least document_len elements long.

Contributor:
I agree with you on the potential of refactored code to ease future maintenance and updates. That said, I built on the existing, repetitive code, so in my humble opinion the refactoring could be handled as a distinct PR (this one adds a feature, while a second one would improve the validated, working code).

I also think a model.initialize_reduced_windows method is not entirely suitable here:
If I am not mistaken, c is not model in this code: it is a special C structure (of a different class depending on the trained model class: word2vec, doc2vec or fasttext), and the reduced_windows PyArray is initialized with the proper size, so there should be no need for a bounds check.
I do not know how we could implement a common method (or function) that would also fill the array with either deterministic or random values, but I agree it could be worth it.

To be honest, I lack experience with Cython, so I do not feel entirely at ease diving into this refactoring. I am willing to give it a try at some point if you want, but would prefer it to be a distinct effort (and PR) from this one - partly because Mathis and I are pushing the shrink_windows feature because we need it for a current research project at work, whereas refactoring the code base is something I would (willingly) do on my own time.

Collaborator:
Alas, there's a lot of pre-existing cut & paste duplication between all these related algorithms & modes that could be refactored. I think that'd be OK to note-and-defer-for-later, as either an issue or a FIXME comment in the code.

Collaborator:
OK, we can deal with this separately.

for i, item in enumerate(model.random.randint(0, c.window, c.document_len)):
c.reduced_windows[i] = item
else:
for i in range(c.document_len):
c.reduced_windows[i] = 0

for i in range(c.doctag_len):
c.doctag_indexes[i] = doctag_indexes[i]
@@ -497,8 +501,12 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N
c.document_len = i

# single randint() call avoids a big thread-sync slowdown
for i, item in enumerate(model.random.randint(0, c.window, c.document_len)):
c.reduced_windows[i] = item
if model.shrink_windows:
for i, item in enumerate(model.random.randint(0, c.window, c.document_len)):
c.reduced_windows[i] = item
else:
for i in range(c.document_len):
c.reduced_windows[i] = 0

for i in range(c.doctag_len):
c.doctag_indexes[i] = doctag_indexes[i]
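As a rough sketch of the refactoring suggested in the review thread above (deferred, not part of this PR), a shared helper could centralize the fill logic; the name, signature and pure-Python form are hypothetical:

import numpy as np

def fill_reduced_windows(reduced_windows, n, window, shrink_windows, rng):
    # Fill the first n entries of `reduced_windows` in place.
    if shrink_windows:
        # Single randint() call avoids a big thread-synchronization slowdown.
        for i, item in enumerate(rng.randint(0, window, n)):
            reduced_windows[i] = item
    else:
        # Deterministic: the full window is always used.
        for i in range(n):
            reduced_windows[i] = 0

# Hypothetical call site, mirroring train_document_dbow above:
#   fill_reduced_windows(c.reduced_windows, c.document_len, c.window,
#                        model.shrink_windows, model.random)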
10 changes: 8 additions & 2 deletions gensim/models/fasttext.py
@@ -276,7 +276,7 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, min_n=3, max_n=6,
sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(),
max_final_vocab=None):
max_final_vocab=None, shrink_windows=True,):
"""Train, use and evaluate word representations learned using the method
described in `Enriching Word Vectors with Subword Information <https://arxiv.org/abs/1607.04606>`_,
aka FastText.
@@ -385,6 +385,11 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
``min_count```. If the specified ``min_count`` is more than the
automatically calculated ``min_count``, the former will be used.
Set to ``None`` if not required.
shrink_windows : bool, optional
If True, the effective window size is uniformly sampled from [1, `window`]
for each target word during training, to match the original word2vec algorithm's
approximate weighting of context words by distance. Otherwise, the effective
window size is always fixed to `window` words to either side.

Examples
--------
@@ -432,7 +437,8 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
max_vocab_size=max_vocab_size, max_final_vocab=max_final_vocab,
min_count=min_count, sample=sample, sorted_vocab=sorted_vocab,
null_word=null_word, ns_exponent=ns_exponent, hashfxn=hashfxn,
seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha)
seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean,
min_alpha=min_alpha, shrink_windows=shrink_windows)

def _init_post_load(self, hidden_output):
num_vectors = len(self.wv.vectors)
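The same flag is exposed on FastText; a brief sketch with illustrative data and parameters:

from gensim.models import FastText

sentences = [
    ["the", "quick", "brown", "fox"],
    ["jumps", "over", "the", "lazy", "dog"],
]

# Train with fixed-size context windows instead of the default uniform shrinking.
model = FastText(sentences, vector_size=32, window=5, min_count=1, epochs=5,
                 shrink_windows=False)
print(model.wv["fox"].shape)  # (32,)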
17 changes: 12 additions & 5 deletions gensim/models/fasttext_corpusfile.pyx
@@ -46,7 +46,8 @@ cdef void prepare_c_structures_for_batch(
vector[vector[string]] &sentences, int sample, int hs, int window, long long *total_words,
int *effective_words, int *effective_sentences, unsigned long long *next_random, cvocab_t *vocab,
int *sentence_idx, np.uint32_t *indexes, int *codelens, np.uint8_t **codes, np.uint32_t **points,
np.uint32_t *reduced_windows, int *subwords_idx_len, np.uint32_t **subwords_idx) nogil:
np.uint32_t *reduced_windows, int *subwords_idx_len, np.uint32_t **subwords_idx, int shrink_windows,
) nogil:
cdef VocabItem word
cdef string token
cdef vector[string] sent
@@ -88,8 +89,12 @@ cdef void prepare_c_structures_for_batch(
break

# precompute "reduced window" offsets in a single randint() call
for i in range(effective_words[0]):
reduced_windows[i] = random_int32(next_random) % window
if shrink_windows:
for i in range(effective_words[0]):
reduced_windows[i] = random_int32(next_random) % window
else:
for i in range(effective_words[0]):
reduced_windows[i] = 0


def train_epoch_sg(
Expand Down Expand Up @@ -136,6 +141,7 @@ def train_epoch_sg(
cdef long long total_sentences = 0
cdef long long total_effective_words = 0, total_words = 0
cdef int sent_idx, idx_start, idx_end
cdef int shrink_windows = int(model.shrink_windows)

init_ft_config(&c, model, _alpha, _work, _l1)

@@ -153,7 +159,7 @@
prepare_c_structures_for_batch(
sentences, c.sample, c.hs, c.window, &total_words, &effective_words, &effective_sentences,
&c.next_random, vocab.get_vocab_ptr(), c.sentence_idx, c.indexes, c.codelens,
c.codes, c.points, c.reduced_windows, c.subwords_idx_len, c.subwords_idx)
c.codes, c.points, c.reduced_windows, c.subwords_idx_len, c.subwords_idx, shrink_windows)

for sent_idx in range(effective_sentences):
idx_start = c.sentence_idx[sent_idx]
@@ -226,6 +232,7 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp
cdef long long total_sentences = 0
cdef long long total_effective_words = 0, total_words = 0
cdef int sent_idx, idx_start, idx_end
cdef int shrink_windows = int(model.shrink_windows)

init_ft_config(&c, model, _alpha, _work, _neu1)

@@ -243,7 +250,7 @@
prepare_c_structures_for_batch(
sentences, c.sample, c.hs, c.window, &total_words, &effective_words, &effective_sentences,
&c.next_random, vocab.get_vocab_ptr(), c.sentence_idx, c.indexes, c.codelens,
c.codes, c.points, c.reduced_windows, c.subwords_idx_len, c.subwords_idx)
c.codes, c.points, c.reduced_windows, c.subwords_idx_len, c.subwords_idx, shrink_windows)

for sent_idx in range(effective_sentences):
idx_start = c.sentence_idx[sent_idx]
8 changes: 6 additions & 2 deletions gensim/models/fasttext_inner.pyx
@@ -601,8 +601,12 @@ def train_batch_any(model, sentences, alpha, _work, _neu1):
num_words, num_sentences = populate_ft_config(&c, model.wv, model.wv.buckets_word, sentences)

# precompute "reduced window" offsets in a single randint() call
for i, randint in enumerate(model.random.randint(0, c.window, num_words)):
c.reduced_windows[i] = randint
if model.shrink_windows:
for i, randint in enumerate(model.random.randint(0, c.window, num_words)):
c.reduced_windows[i] = randint
else:
for i in range(num_words):
c.reduced_windows[i] = 0

# release GIL & train on all sentences in the batch
with nogil:
36 changes: 26 additions & 10 deletions gensim/models/word2vec.py
@@ -240,7 +240,7 @@ def __init__(
max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0,
trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
comment=None, max_final_vocab=None,
comment=None, max_final_vocab=None, shrink_windows=True,
):
"""Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/.

@@ -345,6 +345,11 @@ def __init__(
:meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`.
callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
Sequence of callbacks to be executed at specific stages during training.
shrink_windows : bool, optional
If True, the effective window size is uniformly sampled from [1, `window`]
for each target word during training, to match the original word2vec algorithm's
approximate weighting of context words by distance. Otherwise, the effective
window size is always fixed to `window` words to either side.

Examples
--------
@@ -377,6 +382,7 @@ def __init__(
self.min_alpha = float(min_alpha)

self.window = int(window)
self.shrink_windows = bool(shrink_windows)
self.random = np.random.RandomState(seed)

self.hs = int(hs)
@@ -910,12 +916,14 @@ def _do_train_epoch(
if self.sg:
examples, tally, raw_tally = train_epoch_sg(
self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1, self.compute_loss,
total_examples, total_words, work, neu1,
self.compute_loss,
)
else:
examples, tally, raw_tally = train_epoch_cbow(
self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1, self.compute_loss,
total_examples, total_words, work, neu1,
self.compute_loss,
)

return examples, tally, raw_tally
@@ -941,20 +949,26 @@ def _do_train_job(self, sentences, alpha, inits):
work, neu1 = inits
tally = 0
if self.sg:
tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss)
tally += train_batch_sg(
self, sentences, alpha, work,
self.compute_loss,
)
else:
tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss)
tally += train_batch_cbow(
self, sentences, alpha, work, neu1,
self.compute_loss,
)
return tally, self._raw_word_count(sentences)

def _clear_post_train(self):
"""Clear any cached values that training may have invalidated."""
self.wv.norms = None

def train(
self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None,
epochs=None, start_alpha=None, end_alpha=None, word_count=0,
queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(),
**kwargs,
self, corpus_iterable=None, corpus_file=None, total_examples=None,
total_words=None, epochs=None, start_alpha=None, end_alpha=None,
word_count=0, queue_factor=2, report_delay=1.0, compute_loss=False,
callbacks=(), **kwargs,
):
"""Update the model's neural weights from a sequence of sentences.

@@ -1039,7 +1053,7 @@ def train(
msg=(
f"training model with {self.workers} workers on {len(self.wv)} vocabulary and "
f"{self.layer1_size} features, using sg={self.sg} hs={self.hs} sample={self.sample} "
f"negative={self.negative} window={self.window}"
f"negative={self.negative} window={self.window} shrink_windows={self.shrink_windows}"
),
)

@@ -1970,6 +1984,8 @@ def _load_specials(self, *args, **kwargs):
self.syn1 = self.syn1
del self.syn1
del self.trainables
if not hasattr(self, 'shrink_windows'):
self.shrink_windows = True

def get_latest_training_loss(self):
"""Get current value of the training loss.
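To close the loop, a hedged end-to-end sketch of Word2Vec with the new argument, including the backward-compatibility default that _load_specials applies when an older saved model lacks the attribute; the corpus and file name are illustrative:

from gensim.models import Word2Vec

sentences = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system", "response", "time"],
]

# shrink_windows=False disables the per-word window shrinking.
model = Word2Vec(sentences, vector_size=20, window=5, min_count=1, epochs=5,
                 shrink_windows=False)
model.save("w2v_fixed_window.model")

reloaded = Word2Vec.load("w2v_fixed_window.model")
print(reloaded.shrink_windows)  # False

# Models saved by gensim versions predating this change have no shrink_windows
# attribute; _load_specials() above then sets it to True, preserving the old behaviour.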