Skip to content

Commit

Permalink
Parameterized shrink_windows tests for Word2Vec and Doc2Vec.
Browse files: browse the repository at this point in the history
Note: as in the previous version, the non-default setting
      `shrink_windows=False` is tested in only a few cases,
      to limit the extra computational cost while still
      making sure all backend code paths are exercised.
Co-Authored-By: Michael Penkov <m@penkov.dev>
  • Loading branch information
pandrey-fr and mpenkov committed Jun 22, 2021
1 parent 22962db commit 3481607
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 86 deletions.
67 changes: 21 additions & 46 deletions gensim/test/test_doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from collections import namedtuple

import numpy as np
import pytest
from testfixtures import log_capture

from gensim import utils
Expand Down Expand Up @@ -523,33 +524,45 @@ def test_dmc_hs_fromfile(self):
)
self.model_sanity(model)

def test_dbow_neg(self):
@pytest.mark.parametrize('shrink_windows', [True, False])
def test_dbow_neg(self, shrink_windows):
"""Test DBOW doc2vec training."""
model = doc2vec.Doc2Vec(list_corpus, vector_size=16, dm=0, hs=0, negative=5, min_count=2, epochs=40)
model = doc2vec.Doc2Vec(
list_corpus, vector_size=16, dm=0, hs=0, negative=5,
min_count=2, epochs=40, shrink_windows=shrink_windows
)
self.model_sanity(model)

def test_dbow_neg_fromfile(self):
@pytest.mark.parametrize('shrink_windows', [True, False])
def test_dbow_neg_fromfile(self, shrink_windows):
"""Test DBOW doc2vec training."""
with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file:
save_lee_corpus_as_line_sentence(corpus_file)
model = doc2vec.Doc2Vec(list_corpus, vector_size=16, dm=0, hs=0, negative=5, min_count=2, epochs=40)
model = doc2vec.Doc2Vec(
list_corpus, vector_size=16, dm=0, hs=0, negative=5,
min_count=2, epochs=40, shrink_windows=shrink_windows
)
self.model_sanity(model)

def test_dmm_neg(self):
@pytest.mark.parametrize('shrink_windows', [True, False])
def test_dmm_neg(self, shrink_windows):
"""Test DM/mean doc2vec training."""
model = doc2vec.Doc2Vec(
list_corpus, dm=1, dm_mean=1, vector_size=24, window=4, hs=0,
negative=10, alpha=0.05, min_count=2, epochs=20
negative=10, alpha=0.05, min_count=2, epochs=20,
shrink_windows=shrink_windows
)
self.model_sanity(model)

def test_dmm_neg_fromfile(self):
@pytest.mark.parametrize('shrink_windows', [True, False])
def test_dmm_neg_fromfile(self, shrink_windows):
"""Test DBOW doc2vec training."""
with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file:
save_lee_corpus_as_line_sentence(corpus_file)
model = doc2vec.Doc2Vec(
list_corpus, dm=1, dm_mean=1, vector_size=24, window=4, hs=0,
negative=10, alpha=0.05, min_count=2, epochs=20
negative=10, alpha=0.05, min_count=2, epochs=20,
shrink_windows=shrink_windows
)
self.model_sanity(model)

Expand Down Expand Up @@ -589,44 +602,6 @@ def test_dmc_neg_fromfile(self):
)
self.model_sanity(model)

def test_dmm_fixedwindowsize(self):
"""Test DMM doc2vec training with fixed window size."""
model = doc2vec.Doc2Vec(
list_corpus, vector_size=24,
dm=1, dm_mean=1, window=4, shrink_windows=False,
hs=0, negative=10, alpha=0.05, min_count=2, epochs=20
)
self.model_sanity(model)

def test_dmm_fixedwindowsize_fromfile(self):
"""Test DMM doc2vec training with fixed window size, from file."""
with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file:
save_lee_corpus_as_line_sentence(corpus_file)
model = doc2vec.Doc2Vec(
corpus_file=corpus_file, vector_size=24,
dm=1, dm_mean=1, window=4, shrink_windows=False,
hs=0, negative=10, alpha=0.05, min_count=2, epochs=20
)
self.model_sanity(model)

def test_dbow_fixedwindowsize(self):
"""Test DBOW doc2vec training with fixed window size."""
model = doc2vec.Doc2Vec(
list_corpus, vector_size=16, shrink_windows=False,
dm=0, hs=0, negative=5, min_count=2, epochs=20
)
self.model_sanity(model)

def test_dbow_fixedwindowsize_fromfile(self):
"""Test DBOW doc2vec training with fixed window size, from file."""
with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file:
save_lee_corpus_as_line_sentence(corpus_file)
model = doc2vec.Doc2Vec(
corpus_file=corpus_file, vector_size=16, shrink_windows=False,
dm=0, hs=0, negative=5, min_count=2, epochs=20
)
self.model_sanity(model)

def test_parallel(self):
"""Test doc2vec parallel training with more than default 3 threads."""
# repeat the ~300 doc (~60000 word) Lee corpus to get 6000 docs (~1.2M words)
Expand Down
61 changes: 21 additions & 40 deletions gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import sys

import numpy as np
import pytest

from testfixtures import log_capture

Expand Down Expand Up @@ -625,13 +626,21 @@ def test_sg_hs_fromfile(self):
model = word2vec.Word2Vec(sg=1, window=4, hs=1, negative=0, min_count=5, epochs=10, workers=2)
self.model_sanity(model, with_corpus_file=True)

def test_sg_neg(self):
@pytest.mark.parametrize('shrink_windows', [True, False])
def test_sg_neg(self, shrink_windows):
"""Test skipgram w/ negative sampling"""
model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, epochs=10, workers=2)
model = word2vec.Word2Vec(
sg=1, window=4, hs=0, negative=15, min_count=5, epochs=10, workers=2,
shrink_windows=shrink_windows
)
self.model_sanity(model)

def test_sg_neg_fromfile(self):
model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, epochs=10, workers=2)
@pytest.mark.parametrize('shrink_windows', [True, False])
def test_sg_neg_fromfile(self, shrink_windows):
model = word2vec.Word2Vec(
sg=1, window=4, hs=0, negative=15, min_count=5, epochs=10, workers=2,
shrink_windows=shrink_windows
)
self.model_sanity(model, with_corpus_file=True)

@unittest.skipIf('BULK_TEST_REPS' not in os.environ, reason="bulk test only occasionally run locally")
Expand All @@ -656,18 +665,22 @@ def test_method_in_bulk(self):
print(np.mean(ranks))
self.assertEquals(failures, 0, "too many failures")

def test_cbow_hs(self, ranks=None):
@pytest.mark.parametrize('shrink_windows', [True, False])
def test_cbow_hs(self, ranks=None, shrink_windows=True):
"""Test CBOW w/ hierarchical softmax"""
model = word2vec.Word2Vec(
sg=0, cbow_mean=1, alpha=0.1, window=2, hs=1, negative=0,
min_count=5, epochs=60, workers=2, batch_words=1000
min_count=5, epochs=60, workers=2, batch_words=1000,
shrink_windows=shrink_windows
)
self.model_sanity(model, ranks=ranks)

def test_cbow_hs_fromfile(self):
@pytest.mark.parametrize('shrink_windows', [True, False])
def test_cbow_hs_fromfile(self, shrink_windows):
model = word2vec.Word2Vec(
sg=0, cbow_mean=1, alpha=0.1, window=2, hs=1, negative=0,
min_count=5, epochs=60, workers=2, batch_words=1000
min_count=5, epochs=60, workers=2, batch_words=1000,
shrink_windows=shrink_windows
)
self.model_sanity(model, with_corpus_file=True)

Expand All @@ -686,38 +699,6 @@ def test_cbow_neg_fromfile(self):
)
self.model_sanity(model, with_corpus_file=True)

def test_sg_fixedwindowsize(self):
"""Test skipgram with fixed window size. Use NS."""
model = word2vec.Word2Vec(
sg=1, window=5, shrink_windows=False, hs=0,
negative=15, min_count=5, epochs=10, workers=2
)
self.model_sanity(model)

def test_sg_fixedwindowsize_fromfile(self):
"""Test skipgram with fixed window size. Use HS and train from file."""
model = word2vec.Word2Vec(
sg=1, window=5, shrink_windows=False, hs=1,
negative=0, min_count=5, epochs=10, workers=2
)
self.model_sanity(model, with_corpus_file=True)

def test_cbow_fixedwindowsize(self, ranks=None):
"""Test CBOW with fixed window size. Use HS."""
model = word2vec.Word2Vec(
sg=0, cbow_mean=1, alpha=0.1, window=5, shrink_windows=False,
hs=1, negative=0, min_count=5, epochs=10, workers=2
)
self.model_sanity(model, ranks=ranks)

def test_cbow_fixedwindowsize_fromfile(self):
"""Test CBOW with fixed window size. Use NS and train from file."""
model = word2vec.Word2Vec(
sg=0, cbow_mean=1, alpha=0.1, window=5, shrink_windows=False,
hs=0, negative=15, min_count=5, epochs=10, workers=2
)
self.model_sanity(model, with_corpus_file=True)

def test_cosmul(self):
model = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0)
sims = model.wv.most_similar_cosmul('graph', topn=10)
Expand Down

0 comments on commit 3481607

Please sign in to comment.