Exercise shrink_windows=True/False in the word2vec and doc2vec sanity tests.

Reviewer notes (text before the first "diff --git" header is not part of any hunk):

* The earlier revision of this patch decorated the tests with
  @pytest.mark.parametrize. The word2vec test class is unittest-style (it
  calls self.assertEquals and uses @unittest.skipIf), and pytest does not
  support parametrization on unittest.TestCase subclasses, so the extra
  argument would never be supplied. The doc2vec test class is presumably a
  unittest.TestCase subclass as well -- TODO confirm. The cases are now
  iterated with self.subTest, which needs no new import, so the
  "import pytest" hunks are dropped.
* test_cbow_hs accepts a caller-supplied "ranks" list, so it keeps an explicit
  shrink_windows=True parameter instead of looping; a thin
  test_cbow_hs_fixedwindowsize wrapper covers shrink_windows=False.
* The doc2vec *_neg_fromfile tests wrote a corpus file but trained from
  list_corpus; they now pass corpus_file=corpus_file, like the removed
  *_fixedwindowsize_fromfile tests did.
* test_dmm_neg_fromfile's docstring said "DBOW" although it trains dm=1,
  dm_mean=1; self.assertEquals (removed in Python 3.12) becomes assertEqual.
* Hunk headers were recomputed for the new content and the "index" lines were
  dropped because the recorded post-image blob ids no longer match.
  Indentation and blank lines were reconstructed from a whitespace-collapsed
  copy of the patch -- run "git apply --check" before relying on it.

diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
--- a/gensim/test/test_doc2vec.py
+++ b/gensim/test/test_doc2vec.py
@@ -523,33 +523,49 @@ def test_dmc_hs_fromfile(self):
             )
             self.model_sanity(model)
 
     def test_dbow_neg(self):
         """Test DBOW doc2vec training."""
-        model = doc2vec.Doc2Vec(list_corpus, vector_size=16, dm=0, hs=0, negative=5, min_count=2, epochs=40)
-        self.model_sanity(model)
+        for shrink_windows in (True, False):
+            with self.subTest(shrink_windows=shrink_windows):
+                model = doc2vec.Doc2Vec(
+                    list_corpus, vector_size=16, dm=0, hs=0, negative=5,
+                    min_count=2, epochs=40, shrink_windows=shrink_windows
+                )
+                self.model_sanity(model)
 
     def test_dbow_neg_fromfile(self):
         """Test DBOW doc2vec training."""
         with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file:
             save_lee_corpus_as_line_sentence(corpus_file)
-            model = doc2vec.Doc2Vec(list_corpus, vector_size=16, dm=0, hs=0, negative=5, min_count=2, epochs=40)
-            self.model_sanity(model)
+            for shrink_windows in (True, False):
+                with self.subTest(shrink_windows=shrink_windows):
+                    model = doc2vec.Doc2Vec(
+                        corpus_file=corpus_file, vector_size=16, dm=0, hs=0, negative=5,
+                        min_count=2, epochs=40, shrink_windows=shrink_windows
+                    )
+                    self.model_sanity(model)
 
     def test_dmm_neg(self):
         """Test DM/mean doc2vec training."""
-        model = doc2vec.Doc2Vec(
-            list_corpus, dm=1, dm_mean=1, vector_size=24, window=4, hs=0,
-            negative=10, alpha=0.05, min_count=2, epochs=20
-        )
-        self.model_sanity(model)
+        for shrink_windows in (True, False):
+            with self.subTest(shrink_windows=shrink_windows):
+                model = doc2vec.Doc2Vec(
+                    list_corpus, dm=1, dm_mean=1, vector_size=24, window=4, hs=0,
+                    negative=10, alpha=0.05, min_count=2, epochs=20,
+                    shrink_windows=shrink_windows
+                )
+                self.model_sanity(model)
 
     def test_dmm_neg_fromfile(self):
-        """Test DBOW doc2vec training."""
+        """Test DM/mean doc2vec training, from file."""
         with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file:
             save_lee_corpus_as_line_sentence(corpus_file)
-            model = doc2vec.Doc2Vec(
-                list_corpus, dm=1, dm_mean=1, vector_size=24, window=4, hs=0,
-                negative=10, alpha=0.05, min_count=2, epochs=20
-            )
-            self.model_sanity(model)
+            for shrink_windows in (True, False):
+                with self.subTest(shrink_windows=shrink_windows):
+                    model = doc2vec.Doc2Vec(
+                        corpus_file=corpus_file, dm=1, dm_mean=1, vector_size=24, window=4, hs=0,
+                        negative=10, alpha=0.05, min_count=2, epochs=20,
+                        shrink_windows=shrink_windows
+                    )
+                    self.model_sanity(model)
 
@@ -589,44 +605,6 @@ def test_dmc_neg_fromfile(self):
             )
             self.model_sanity(model)
 
-    def test_dmm_fixedwindowsize(self):
-        """Test DMM doc2vec training with fixed window size."""
-        model = doc2vec.Doc2Vec(
-            list_corpus, vector_size=24,
-            dm=1, dm_mean=1, window=4, shrink_windows=False,
-            hs=0, negative=10, alpha=0.05, min_count=2, epochs=20
-        )
-        self.model_sanity(model)
-
-    def test_dmm_fixedwindowsize_fromfile(self):
-        """Test DMM doc2vec training with fixed window size, from file."""
-        with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file:
-            save_lee_corpus_as_line_sentence(corpus_file)
-            model = doc2vec.Doc2Vec(
-                corpus_file=corpus_file, vector_size=24,
-                dm=1, dm_mean=1, window=4, shrink_windows=False,
-                hs=0, negative=10, alpha=0.05, min_count=2, epochs=20
-            )
-            self.model_sanity(model)
-
-    def test_dbow_fixedwindowsize(self):
-        """Test DBOW doc2vec training with fixed window size."""
-        model = doc2vec.Doc2Vec(
-            list_corpus, vector_size=16, shrink_windows=False,
-            dm=0, hs=0, negative=5, min_count=2, epochs=20
-        )
-        self.model_sanity(model)
-
-    def test_dbow_fixedwindowsize_fromfile(self):
-        """Test DBOW doc2vec training with fixed window size, from file."""
-        with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file:
-            save_lee_corpus_as_line_sentence(corpus_file)
-            model = doc2vec.Doc2Vec(
-                corpus_file=corpus_file, vector_size=16, shrink_windows=False,
-                dm=0, hs=0, negative=5, min_count=2, epochs=20
-            )
-            self.model_sanity(model)
-
     def test_parallel(self):
         """Test doc2vec parallel training with more than default 3 threads."""
         # repeat the ~300 doc (~60000 word) Lee corpus to get 6000 docs (~1.2M words)
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -625,13 +625,23 @@ def test_sg_hs_fromfile(self):
         model = word2vec.Word2Vec(sg=1, window=4, hs=1, negative=0, min_count=5, epochs=10, workers=2)
         self.model_sanity(model, with_corpus_file=True)
 
     def test_sg_neg(self):
         """Test skipgram w/ negative sampling"""
-        model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, epochs=10, workers=2)
-        self.model_sanity(model)
+        for shrink_windows in (True, False):
+            with self.subTest(shrink_windows=shrink_windows):
+                model = word2vec.Word2Vec(
+                    sg=1, window=4, hs=0, negative=15, min_count=5, epochs=10, workers=2,
+                    shrink_windows=shrink_windows
+                )
+                self.model_sanity(model)
 
     def test_sg_neg_fromfile(self):
-        model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, epochs=10, workers=2)
-        self.model_sanity(model, with_corpus_file=True)
+        for shrink_windows in (True, False):
+            with self.subTest(shrink_windows=shrink_windows):
+                model = word2vec.Word2Vec(
+                    sg=1, window=4, hs=0, negative=15, min_count=5, epochs=10, workers=2,
+                    shrink_windows=shrink_windows
+                )
+                self.model_sanity(model, with_corpus_file=True)
 
     @unittest.skipIf('BULK_TEST_REPS' not in os.environ, reason="bulk test only occasionally run locally")
@@ -656,18 +666,26 @@ def test_method_in_bulk(self):
         print(np.mean(ranks))
-        self.assertEquals(failures, 0, "too many failures")
+        self.assertEqual(failures, 0, "too many failures")
 
-    def test_cbow_hs(self, ranks=None):
+    def test_cbow_hs(self, ranks=None, shrink_windows=True):
         """Test CBOW w/ hierarchical softmax"""
         model = word2vec.Word2Vec(
             sg=0, cbow_mean=1, alpha=0.1, window=2, hs=1, negative=0,
-            min_count=5, epochs=60, workers=2, batch_words=1000
+            min_count=5, epochs=60, workers=2, batch_words=1000,
+            shrink_windows=shrink_windows
         )
         self.model_sanity(model, ranks=ranks)
 
+    def test_cbow_hs_fixedwindowsize(self, ranks=None):
+        """Test CBOW w/ hierarchical softmax and a fixed (non-shrinking) window"""
+        self.test_cbow_hs(ranks=ranks, shrink_windows=False)
+
     def test_cbow_hs_fromfile(self):
-        model = word2vec.Word2Vec(
-            sg=0, cbow_mean=1, alpha=0.1, window=2, hs=1, negative=0,
-            min_count=5, epochs=60, workers=2, batch_words=1000
-        )
-        self.model_sanity(model, with_corpus_file=True)
+        for shrink_windows in (True, False):
+            with self.subTest(shrink_windows=shrink_windows):
+                model = word2vec.Word2Vec(
+                    sg=0, cbow_mean=1, alpha=0.1, window=2, hs=1, negative=0,
+                    min_count=5, epochs=60, workers=2, batch_words=1000,
+                    shrink_windows=shrink_windows
+                )
+                self.model_sanity(model, with_corpus_file=True)
 
@@ -686,38 +704,6 @@ def test_cbow_neg_fromfile(self):
         )
         self.model_sanity(model, with_corpus_file=True)
 
-    def test_sg_fixedwindowsize(self):
-        """Test skipgram with fixed window size. Use NS."""
-        model = word2vec.Word2Vec(
-            sg=1, window=5, shrink_windows=False, hs=0,
-            negative=15, min_count=5, epochs=10, workers=2
-        )
-        self.model_sanity(model)
-
-    def test_sg_fixedwindowsize_fromfile(self):
-        """Test skipgram with fixed window size. Use HS and train from file."""
-        model = word2vec.Word2Vec(
-            sg=1, window=5, shrink_windows=False, hs=1,
-            negative=0, min_count=5, epochs=10, workers=2
-        )
-        self.model_sanity(model, with_corpus_file=True)
-
-    def test_cbow_fixedwindowsize(self, ranks=None):
-        """Test CBOW with fixed window size. Use HS."""
-        model = word2vec.Word2Vec(
-            sg=0, cbow_mean=1, alpha=0.1, window=5, shrink_windows=False,
-            hs=1, negative=0, min_count=5, epochs=10, workers=2
-        )
-        self.model_sanity(model, ranks=ranks)
-
-    def test_cbow_fixedwindowsize_fromfile(self):
-        """Test CBOW with fixed window size. Use NS and train from file."""
-        model = word2vec.Word2Vec(
-            sg=0, cbow_mean=1, alpha=0.1, window=5, shrink_windows=False,
-            hs=0, negative=15, min_count=5, epochs=10, workers=2
-        )
-        self.model_sanity(model, with_corpus_file=True)
-
     def test_cosmul(self):
         model = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0)
         sims = model.wv.most_similar_cosmul('graph', topn=10)