Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement shrink_windows argument for Word2Vec. #3169

Merged
merged 14 commits into from
Jun 29, 2021
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Added tests for shrink_windows=False in Word2Vec-based models.
  • Loading branch information
pandrey-fr committed Jun 17, 2021
commit 486b3f502e45b4bc06bf1a9196c9de41587478b1
38 changes: 38 additions & 0 deletions gensim/test/test_doc2vec.py
Original file line number Diff line number Diff line change
@@ -589,6 +589,44 @@ def test_dmc_neg_fromfile(self):
)
self.model_sanity(model)

def test_dmm_fixedwindowsize(self):
    """Check DM/mean doc2vec training when the window is not shrunk.

    Trains on ``list_corpus`` with ``dm=1, dm_mean=1`` and negative sampling
    (``hs=0, negative=10``), passing ``shrink_windows=False``, then runs the
    shared ``model_sanity`` checks on the result.
    """
    settings = dict(
        vector_size=24,
        dm=1,
        dm_mean=1,
        window=4,
        shrink_windows=False,
        hs=0,
        negative=10,
        alpha=0.05,
        min_count=2,
        epochs=20,
    )
    trained = doc2vec.Doc2Vec(list_corpus, **settings)
    self.model_sanity(trained)

def test_dmm_fixedwindowsize_fromfile(self):
    """Check DM/mean doc2vec training from a corpus file, window not shrunk.

    The Lee corpus is written to a temporary line-sentence file, a model is
    trained from it via ``corpus_file=`` with ``shrink_windows=False``, and
    the shared ``model_sanity`` checks are run while the file still exists.
    """
    settings = dict(
        vector_size=24,
        dm=1,
        dm_mean=1,
        window=4,
        shrink_windows=False,
        hs=0,
        negative=10,
        alpha=0.05,
        min_count=2,
        epochs=20,
    )
    tmp_path = get_tmpfile('gensim_doc2vec.tst')
    with temporary_file(tmp_path) as corpus_file:
        save_lee_corpus_as_line_sentence(corpus_file)
        trained = doc2vec.Doc2Vec(corpus_file=corpus_file, **settings)
        self.model_sanity(trained)

def test_dbow_fixedwindowsize(self):
    """Check DBOW doc2vec training when ``shrink_windows=False`` is passed.

    Trains on ``list_corpus`` with ``dm=0`` and negative sampling
    (``hs=0, negative=5``), then runs the shared ``model_sanity`` checks.
    """
    settings = dict(
        vector_size=16,
        shrink_windows=False,
        dm=0,
        hs=0,
        negative=5,
        min_count=2,
        epochs=20,
    )
    trained = doc2vec.Doc2Vec(list_corpus, **settings)
    self.model_sanity(trained)

def test_dbow_fixedwindowsize_fromfile(self):
    """Check DBOW doc2vec training from a corpus file with ``shrink_windows=False``.

    The Lee corpus is written to a temporary line-sentence file, a ``dm=0``
    model is trained from it via ``corpus_file=``, and the shared
    ``model_sanity`` checks are run while the file still exists.
    """
    settings = dict(
        vector_size=16,
        shrink_windows=False,
        dm=0,
        hs=0,
        negative=5,
        min_count=2,
        epochs=20,
    )
    tmp_path = get_tmpfile('gensim_doc2vec.tst')
    with temporary_file(tmp_path) as corpus_file:
        save_lee_corpus_as_line_sentence(corpus_file)
        trained = doc2vec.Doc2Vec(corpus_file=corpus_file, **settings)
        self.model_sanity(trained)

def test_parallel(self):
"""Test doc2vec parallel training with more than default 3 threads."""
# repeat the ~300 doc (~60000 word) Lee corpus to get 6000 docs (~1.2M words)
28 changes: 20 additions & 8 deletions gensim/test/test_fasttext.py
Original file line number Diff line number Diff line change
@@ -397,12 +397,12 @@ def test_wm_distance(self):
dist = self.test_model.wv.wmdistance(doc, oov_doc)
self.assertNotEqual(float('inf'), dist)

def test_cbow_hs_training(self):
def test_cbow_hs_training(self, shrink_windows=True):

model_gensim = FT_gensim(
vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET, shrink_windows=shrink_windows)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
@@ -429,12 +429,12 @@ def test_cbow_hs_training(self):
overlap_count, 2,
"only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words))

def test_cbow_hs_training_fromfile(self):
def test_cbow_hs_training_fromfile(self, shrink_windows=True):
with temporary_file('gensim_fasttext.tst') as corpus_file:
model_gensim = FT_gensim(
vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4)
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4, shrink_windows=shrink_windows)

lee_data = LineSentence(datapath('lee_background.cor'))
utils.save_as_line_sentence(lee_data, corpus_file)
@@ -465,12 +465,12 @@ def test_cbow_hs_training_fromfile(self):
overlap_count, 2,
"only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words))

def test_sg_hs_training(self):
def test_sg_hs_training(self, shrink_windows=True):

model_gensim = FT_gensim(
vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET, shrink_windows=shrink_windows)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
@@ -497,12 +497,12 @@ def test_sg_hs_training(self):
overlap_count, 2,
"only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words))

def test_sg_hs_training_fromfile(self):
def test_sg_hs_training_fromfile(self, shrink_windows=True):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://docs.pytest.org/en/6.2.x/example/parametrize.html

Suggested change
def test_sg_hs_training_fromfile(self, shrink_windows=True):
@pytest.mark.parametrize('shrink_windows', [True, False])
def test_sg_hs_training_fromfile(self, shrink_windows):

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I pushed two commits making use of that feature. The first one only applied to the FastText tests, as they were the ones you attached this review to. The second one applies to the other two test sets (word2vec and doc2vec); if you feel like the latter should not have been changed, we can discard this commit.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Update: actually, pytest.mark.parametrize does not work as expected on methods (although it does on functions). I will try to find a way to make it work, otherwise I will undo the last two commits.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, in fact it probably will not work due to the test classes being unittest.TestCase subclasses.

https://docs.pytest.org/en/6.2.x/unittest.html

The following pytest features do not work [in unittest.TestCase subclasses], and probably never will due to different design philosophies: Fixtures (except for autouse fixtures, see below); Parametrization; Custom hooks;

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am revoking the last two commits as a consequence.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for pointing that out. I didn't know pytest parameterization is incompatible with unittest.TestCase.

Luckily, in this particular case, the unittest.TestCase class isn't doing anything. I'll get parameterization working in a separate commit.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, we both learned something :)

with temporary_file('gensim_fasttext.tst') as corpus_file:
model_gensim = FT_gensim(
vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET, shrink_windows=shrink_windows)

lee_data = LineSentence(datapath('lee_background.cor'))
utils.save_as_line_sentence(lee_data, corpus_file)
@@ -533,6 +533,18 @@ def test_sg_hs_training_fromfile(self):
overlap_count, 2,
"only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words))

def test_cbow_hs_training_fixedwindowsize(self):
    """Re-run the CBOW/HS training test with ``shrink_windows=False``."""
    run_training_check = self.test_cbow_hs_training
    run_training_check(shrink_windows=False)

def test_cbow_hs_training_fixedwindowsize_fromfile(self):
    """Re-run the CBOW/HS corpus-file training test with ``shrink_windows=False``."""
    run_training_check = self.test_cbow_hs_training_fromfile
    run_training_check(shrink_windows=False)

def test_sg_hs_training_fixedwindowsize(self):
    """Re-run the skip-gram/HS training test with ``shrink_windows=False``."""
    run_training_check = self.test_sg_hs_training
    run_training_check(shrink_windows=False)

def test_sg_hs_training_fixedwindowsize_fromfile(self):
    """Re-run the skip-gram/HS corpus-file training test with ``shrink_windows=False``."""
    run_training_check = self.test_sg_hs_training_fromfile
    run_training_check(shrink_windows=False)

def test_cbow_neg_training(self):

model_gensim = FT_gensim(
32 changes: 32 additions & 0 deletions gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
@@ -686,6 +686,38 @@ def test_cbow_neg_fromfile(self):
)
self.model_sanity(model, with_corpus_file=True)

def test_sg_fixedwindowsize(self):
    """Check skip-gram with negative sampling when the window is not shrunk.

    A model configured with ``sg=1, hs=0, negative=15`` and
    ``shrink_windows=False`` is built without a corpus and passed to the
    shared ``model_sanity`` helper.
    """
    settings = dict(
        sg=1,
        window=5,
        shrink_windows=False,
        hs=0,
        negative=15,
        min_count=5,
        epochs=10,
        workers=2,
    )
    candidate = word2vec.Word2Vec(**settings)
    self.model_sanity(candidate)

def test_sg_fixedwindowsize_fromfile(self):
    """Check skip-gram with hierarchical softmax, window not shrunk, from file.

    A model configured with ``sg=1, hs=1, negative=0`` and
    ``shrink_windows=False`` is built without a corpus and passed to the
    shared ``model_sanity`` helper with ``with_corpus_file=True``.
    """
    settings = dict(
        sg=1,
        window=5,
        shrink_windows=False,
        hs=1,
        negative=0,
        min_count=5,
        epochs=10,
        workers=2,
    )
    candidate = word2vec.Word2Vec(**settings)
    self.model_sanity(candidate, with_corpus_file=True)

def test_cbow_fixedwindowsize(self, ranks=None):
    """Check CBOW with hierarchical softmax when the window is not shrunk.

    A model configured with ``sg=0, cbow_mean=1, hs=1, negative=0`` and
    ``shrink_windows=False`` is built without a corpus and passed to the
    shared ``model_sanity`` helper; ``ranks`` is forwarded to it unchanged.
    """
    settings = dict(
        sg=0,
        cbow_mean=1,
        alpha=0.1,
        window=5,
        shrink_windows=False,
        hs=1,
        negative=0,
        min_count=5,
        epochs=10,
        workers=2,
    )
    candidate = word2vec.Word2Vec(**settings)
    self.model_sanity(candidate, ranks=ranks)

def test_cbow_fixedwindowsize_fromfile(self):
    """Check CBOW with negative sampling, window not shrunk, from file.

    A model configured with ``sg=0, cbow_mean=1, hs=0, negative=15`` and
    ``shrink_windows=False`` is built without a corpus and passed to the
    shared ``model_sanity`` helper with ``with_corpus_file=True``.
    """
    settings = dict(
        sg=0,
        cbow_mean=1,
        alpha=0.1,
        window=5,
        shrink_windows=False,
        hs=0,
        negative=15,
        min_count=5,
        epochs=10,
        workers=2,
    )
    candidate = word2vec.Word2Vec(**settings)
    self.model_sanity(candidate, with_corpus_file=True)

def test_cosmul(self):
model = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0)
sims = model.wv.most_similar_cosmul('graph', topn=10)