Added tests for shrink_windows=False in Word2Vec-based models.

piskvorky · mpenkov · Jun 29, 2021 · Jun 7, 2021 · Jun 10, 2021 · Jun 10, 2021
commit 486b3f502e45b4bc06bf1a9196c9de41587478b1
diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
@@ -589,6 +589,44 @@ def test_dmc_neg_fromfile(self):
             )
             self.model_sanity(model)
 
+    def test_dmm_fixedwindowsize(self):
+        """Test DMM doc2vec training with fixed window size."""
+        model = doc2vec.Doc2Vec(
+            list_corpus, vector_size=24,
+            dm=1, dm_mean=1, window=4, shrink_windows=False,
+            hs=0, negative=10, alpha=0.05, min_count=2, epochs=20
+        )
+        self.model_sanity(model)
+
+    def test_dmm_fixedwindowsize_fromfile(self):
+        """Test DMM doc2vec training with fixed window size, from file."""
+        with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file:
+            save_lee_corpus_as_line_sentence(corpus_file)
+            model = doc2vec.Doc2Vec(
+                corpus_file=corpus_file, vector_size=24,
+                dm=1, dm_mean=1, window=4, shrink_windows=False,
+                hs=0, negative=10, alpha=0.05, min_count=2, epochs=20
+            )
+            self.model_sanity(model)
+
+    def test_dbow_fixedwindowsize(self):
+        """Test DBOW doc2vec training with fixed window size."""
+        model = doc2vec.Doc2Vec(
+            list_corpus, vector_size=16, shrink_windows=False,
+            dm=0, hs=0, negative=5, min_count=2, epochs=20
+        )
+        self.model_sanity(model)
+
+    def test_dbow_fixedwindowsize_fromfile(self):
+        """Test DBOW doc2vec training with fixed window size, from file."""
+        with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file:
+            save_lee_corpus_as_line_sentence(corpus_file)
+            model = doc2vec.Doc2Vec(
+                corpus_file=corpus_file, vector_size=16, shrink_windows=False,
+                dm=0, hs=0, negative=5, min_count=2, epochs=20
+            )
+            self.model_sanity(model)
+
     def test_parallel(self):
         """Test doc2vec parallel training with more than default 3 threads."""
         # repeat the ~300 doc (~60000 word) Lee corpus to get 6000 docs (~1.2M words)

diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
@@ -397,12 +397,12 @@ def test_wm_distance(self):
         dist = self.test_model.wv.wmdistance(doc, oov_doc)
         self.assertNotEqual(float('inf'), dist)
 
-    def test_cbow_hs_training(self):
+    def test_cbow_hs_training(self, shrink_windows=True):
 
         model_gensim = FT_gensim(
             vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
             min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-            sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)
+            sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET, shrink_windows=shrink_windows)
 
         lee_data = LineSentence(datapath('lee_background.cor'))
         model_gensim.build_vocab(lee_data)
@@ -429,12 +429,12 @@ def test_cbow_hs_training(self):
             overlap_count, 2,
             "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words))
 
-    def test_cbow_hs_training_fromfile(self):
+    def test_cbow_hs_training_fromfile(self, shrink_windows=True):
         with temporary_file('gensim_fasttext.tst') as corpus_file:
             model_gensim = FT_gensim(
                 vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
                 min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-                sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4)
+                sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4, shrink_windows=shrink_windows)
 
             lee_data = LineSentence(datapath('lee_background.cor'))
             utils.save_as_line_sentence(lee_data, corpus_file)
@@ -465,12 +465,12 @@ def test_cbow_hs_training_fromfile(self):
                 overlap_count, 2,
                 "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words))
 
-    def test_sg_hs_training(self):
+    def test_sg_hs_training(self, shrink_windows=True):
 
         model_gensim = FT_gensim(
             vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
             min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-            sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)
+            sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET, shrink_windows=shrink_windows)
 
         lee_data = LineSentence(datapath('lee_background.cor'))
         model_gensim.build_vocab(lee_data)
@@ -497,12 +497,12 @@ def test_sg_hs_training(self):
             overlap_count, 2,
             "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words))
 
-    def test_sg_hs_training_fromfile(self):
+    def test_sg_hs_training_fromfile(self, shrink_windows=True):
         with temporary_file('gensim_fasttext.tst') as corpus_file:
             model_gensim = FT_gensim(
                 vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
                 min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-                sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)
+                sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET, shrink_windows=shrink_windows)
 
             lee_data = LineSentence(datapath('lee_background.cor'))
             utils.save_as_line_sentence(lee_data, corpus_file)
@@ -533,6 +533,18 @@ def test_sg_hs_training_fromfile(self):
                 overlap_count, 2,
                 "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words))
 
+    def test_cbow_hs_training_fixedwindowsize(self):
+        self.test_cbow_hs_training(shrink_windows=False)
+
+    def test_cbow_hs_training_fixedwindowsize_fromfile(self):
+        self.test_cbow_hs_training_fromfile(shrink_windows=False)
+
+    def test_sg_hs_training_fixedwindowsize(self):
+        self.test_sg_hs_training(shrink_windows=False)
+
+    def test_sg_hs_training_fixedwindowsize_fromfile(self):
+        self.test_sg_hs_training_fromfile(shrink_windows=False)
+
     def test_cbow_neg_training(self):
 
         model_gensim = FT_gensim(

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
@@ -686,6 +686,38 @@ def test_cbow_neg_fromfile(self):
         )
         self.model_sanity(model, with_corpus_file=True)
 
+    def test_sg_fixedwindowsize(self):
+        """Test skipgram with fixed window size. Use NS."""
+        model = word2vec.Word2Vec(
+            sg=1, window=5, shrink_windows=False, hs=0,
+            negative=15, min_count=5, epochs=10, workers=2
+        )
+        self.model_sanity(model)
+
+    def test_sg_fixedwindowsize_fromfile(self):
+        """Test skipgram with fixed window size. Use HS and train from file."""
+        model = word2vec.Word2Vec(
+            sg=1, window=5, shrink_windows=False, hs=1,
+            negative=0, min_count=5, epochs=10, workers=2
+        )
+        self.model_sanity(model, with_corpus_file=True)
+
+    def test_cbow_fixedwindowsize(self, ranks=None):
+        """Test CBOW with fixed window size. Use HS."""
+        model = word2vec.Word2Vec(
+            sg=0, cbow_mean=1, alpha=0.1, window=5, shrink_windows=False,
+            hs=1, negative=0, min_count=5, epochs=10, workers=2
+        )
+        self.model_sanity(model, ranks=ranks)
+
+    def test_cbow_fixedwindowsize_fromfile(self):
+        """Test CBOW with fixed window size. Use NS and train from file."""
+        model = word2vec.Word2Vec(
+            sg=0, cbow_mean=1, alpha=0.1, window=5, shrink_windows=False,
+            hs=0, negative=15, min_count=5, epochs=10, workers=2
+        )
+        self.model_sanity(model, with_corpus_file=True)
+
     def test_cosmul(self):
         model = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0)
         sims = model.wv.most_similar_cosmul('graph', topn=10)