From 2a41200b23ace0e8ff1b1b1d48bd87fcf5cfc654 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Tue, 29 Jun 2021 15:11:16 +0900 Subject: [PATCH] polishing up after #3169 The repo wasn't accepting maintainer commits, so I'm taking care of this here. --- CHANGELOG.md | 1 + gensim/test/test_fasttext.py | 290 +++++++++++++++++------------------ 2 files changed, 142 insertions(+), 149 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b4a0a0842..a5db70b69a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ Changes * [#3131](https://github.com/RaRe-Technologies/gensim/pull/3131): Added import to Nmf docs, and to models/__init__.py, by [@properGrammar](https://github.com/properGrammar) * [#3163](https://github.com/RaRe-Technologies/gensim/pull/3163): Optimize word mover distance (WMD) computation, by [@flowlight0](https://github.com/flowlight0) * [#2965](https://github.com/RaRe-Technologies/gensim/pull/2965): Remove strip_punctuation2 alias of strip_punctuation, by [@sciatro](https://github.com/sciatro) +* [#3169](https://github.com/RaRe-Technologies/gensim/pull/3169): Implement `shrink_windows` argument for Word2Vec, by [@M-Demay](https://github.com/M-Demay) ### :books: Documentation diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 382ecdfb2c..682972d68d 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -13,6 +13,7 @@ import sys import numpy as np +import pytest from gensim import utils from gensim.models.word2vec import LineSentence @@ -397,156 +398,7 @@ def test_wm_distance(self): dist = self.test_model.wv.wmdistance(doc, oov_doc) self.assertNotEqual(float('inf'), dist) - def test_cbow_hs_training(self, shrink_windows=True): - - model_gensim = FT_gensim( - vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, - min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET, 
shrink_windows=shrink_windows) - - lee_data = LineSentence(datapath('lee_background.cor')) - model_gensim.build_vocab(lee_data) - orig0 = np.copy(model_gensim.wv.vectors[0]) - model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs) - self.assertFalse((orig0 == model_gensim.wv.vectors[0]).all()) # vector should vary after training - - sims_gensim = model_gensim.wv.most_similar('night', topn=10) - sims_gensim_words = [word for (word, distance) in sims_gensim] # get similar words - expected_sims_words = [ - u'night,', - u'night.', - u'rights', - u'kilometres', - u'in', - u'eight', - u'according', - u'flights', - u'during', - u'comes'] - overlaps = set(sims_gensim_words).intersection(expected_sims_words) - overlap_count = len(overlaps) - self.assertGreaterEqual( - overlap_count, 2, - "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words)) - - def test_cbow_hs_training_fromfile(self, shrink_windows=True): - with temporary_file('gensim_fasttext.tst') as corpus_file: - model_gensim = FT_gensim( - vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, - min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4, shrink_windows=shrink_windows) - - lee_data = LineSentence(datapath('lee_background.cor')) - utils.save_as_line_sentence(lee_data, corpus_file) - - model_gensim.build_vocab(corpus_file=corpus_file) - orig0 = np.copy(model_gensim.wv.vectors[0]) - model_gensim.train(corpus_file=corpus_file, - total_words=model_gensim.corpus_total_words, - epochs=model_gensim.epochs) - self.assertFalse((orig0 == model_gensim.wv.vectors[0]).all()) # vector should vary after training - - sims_gensim = model_gensim.wv.most_similar('night', topn=10) - sims_gensim_words = [word for (word, distance) in sims_gensim] # get similar words - expected_sims_words = [ - u'night,', - u'night.', - 
u'rights', - u'kilometres', - u'in', - u'eight', - u'according', - u'flights', - u'during', - u'comes'] - overlaps = set(sims_gensim_words).intersection(expected_sims_words) - overlap_count = len(overlaps) - self.assertGreaterEqual( - overlap_count, 2, - "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words)) - - def test_sg_hs_training(self, shrink_windows=True): - - model_gensim = FT_gensim( - vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0, - min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET, shrink_windows=shrink_windows) - - lee_data = LineSentence(datapath('lee_background.cor')) - model_gensim.build_vocab(lee_data) - orig0 = np.copy(model_gensim.wv.vectors[0]) - model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs) - self.assertFalse((orig0 == model_gensim.wv.vectors[0]).all()) # vector should vary after training - - sims_gensim = model_gensim.wv.most_similar('night', topn=10) - sims_gensim_words = [word for (word, distance) in sims_gensim] # get similar words - expected_sims_words = [ - u'night,', - u'night.', - u'eight', - u'nine', - u'overnight', - u'crew', - u'overnight.', - u'manslaughter', - u'north', - u'flight'] - overlaps = set(sims_gensim_words).intersection(expected_sims_words) - overlap_count = len(overlaps) - self.assertGreaterEqual( - overlap_count, 2, - "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words)) - - def test_sg_hs_training_fromfile(self, shrink_windows=True): - with temporary_file('gensim_fasttext.tst') as corpus_file: - model_gensim = FT_gensim( - vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0, - min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET, 
shrink_windows=shrink_windows) - - lee_data = LineSentence(datapath('lee_background.cor')) - utils.save_as_line_sentence(lee_data, corpus_file) - - model_gensim.build_vocab(corpus_file=corpus_file) - orig0 = np.copy(model_gensim.wv.vectors[0]) - model_gensim.train(corpus_file=corpus_file, - total_words=model_gensim.corpus_total_words, - epochs=model_gensim.epochs) - self.assertFalse((orig0 == model_gensim.wv.vectors[0]).all()) # vector should vary after training - - sims_gensim = model_gensim.wv.most_similar('night', topn=10) - sims_gensim_words = [word for (word, distance) in sims_gensim] # get similar words - expected_sims_words = [ - u'night,', - u'night.', - u'eight', - u'nine', - u'overnight', - u'crew', - u'overnight.', - u'manslaughter', - u'north', - u'flight'] - overlaps = set(sims_gensim_words).intersection(expected_sims_words) - overlap_count = len(overlaps) - self.assertGreaterEqual( - overlap_count, 2, - "only %i overlap in expected %s & actual %s" % (overlap_count, expected_sims_words, sims_gensim_words)) - - def test_cbow_hs_training_fixedwindowsize(self): - self.test_cbow_hs_training(shrink_windows=False) - - def test_cbow_hs_training_fixedwindowsize_fromfile(self): - self.test_cbow_hs_training_fromfile(shrink_windows=False) - - def test_sg_hs_training_fixedwindowsize(self): - self.test_sg_hs_training(shrink_windows=False) - - def test_sg_hs_training_fixedwindowsize_fromfile(self): - self.test_sg_hs_training_fromfile(shrink_windows=False) - def test_cbow_neg_training(self): - model_gensim = FT_gensim( vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5, min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, @@ -863,6 +715,146 @@ def obsolete_testLoadOldModel(self): self.assertEqual(model.wv.vectors_ngrams.shape, (2000000, 100)) +@pytest.mark.parametrize('shrink_windows', [True, False]) +def test_cbow_hs_training(shrink_windows): + model_gensim = FT_gensim( + vector_size=48, sg=0, cbow_mean=1, 
alpha=0.05, window=5, hs=1, negative=0, + min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET, shrink_windows=shrink_windows) + + lee_data = LineSentence(datapath('lee_background.cor')) + model_gensim.build_vocab(lee_data) + orig0 = np.copy(model_gensim.wv.vectors[0]) + model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs) + assert not (orig0 == model_gensim.wv.vectors[0]).all() # vector should vary after training + + sims_gensim = model_gensim.wv.most_similar('night', topn=10) + sims_gensim_words = [word for (word, distance) in sims_gensim] # get similar words + expected_sims_words = [ + u'night,', + u'night.', + u'rights', + u'kilometres', + u'in', + u'eight', + u'according', + u'flights', + u'during', + u'comes'] + overlaps = set(sims_gensim_words).intersection(expected_sims_words) + overlap_count = len(overlaps) + + message = f"only {overlap_count} overlap in expected {expected_sims_words} & actual {sims_gensim_words}" + assert overlap_count >= 2, message + + +@pytest.mark.parametrize('shrink_windows', [True, False]) +def test_cbow_hs_training_fromfile(shrink_windows): + with temporary_file('gensim_fasttext.tst') as corpus_file: + model_gensim = FT_gensim( + vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, + min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4, shrink_windows=shrink_windows) + + lee_data = LineSentence(datapath('lee_background.cor')) + utils.save_as_line_sentence(lee_data, corpus_file) + + model_gensim.build_vocab(corpus_file=corpus_file) + orig0 = np.copy(model_gensim.wv.vectors[0]) + model_gensim.train(corpus_file=corpus_file, + total_words=model_gensim.corpus_total_words, + epochs=model_gensim.epochs) + assert not (orig0 == model_gensim.wv.vectors[0]).all() # vector should vary after 
training + + sims_gensim = model_gensim.wv.most_similar('night', topn=10) + sims_gensim_words = [word for (word, distance) in sims_gensim] # get similar words + expected_sims_words = [ + u'night,', + u'night.', + u'rights', + u'kilometres', + u'in', + u'eight', + u'according', + u'flights', + u'during', + u'comes'] + overlaps = set(sims_gensim_words).intersection(expected_sims_words) + overlap_count = len(overlaps) + message = f"only {overlap_count} overlap in expected {expected_sims_words} & actual {sims_gensim_words}" + assert overlap_count >= 2, message + + +@pytest.mark.parametrize('shrink_windows', [True, False]) +def test_sg_hs_training(shrink_windows): + model_gensim = FT_gensim( + vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0, + min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET, shrink_windows=shrink_windows) + + lee_data = LineSentence(datapath('lee_background.cor')) + model_gensim.build_vocab(lee_data) + orig0 = np.copy(model_gensim.wv.vectors[0]) + model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs) + assert not (orig0 == model_gensim.wv.vectors[0]).all() # vector should vary after training + + sims_gensim = model_gensim.wv.most_similar('night', topn=10) + sims_gensim_words = [word for (word, distance) in sims_gensim] # get similar words + expected_sims_words = [ + u'night,', + u'night.', + u'eight', + u'nine', + u'overnight', + u'crew', + u'overnight.', + u'manslaughter', + u'north', + u'flight'] + overlaps = set(sims_gensim_words).intersection(expected_sims_words) + overlap_count = len(overlaps) + + message = f"only {overlap_count} overlap in expected {expected_sims_words} & actual {sims_gensim_words}" + assert overlap_count >= 2, message + + +@pytest.mark.parametrize('shrink_windows', [True, False]) +def test_sg_hs_training_fromfile(shrink_windows): + with 
temporary_file('gensim_fasttext.tst') as corpus_file: + model_gensim = FT_gensim( + vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0, + min_count=5, epochs=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET, shrink_windows=shrink_windows) + + lee_data = LineSentence(datapath('lee_background.cor')) + utils.save_as_line_sentence(lee_data, corpus_file) + + model_gensim.build_vocab(corpus_file=corpus_file) + orig0 = np.copy(model_gensim.wv.vectors[0]) + model_gensim.train(corpus_file=corpus_file, + total_words=model_gensim.corpus_total_words, + epochs=model_gensim.epochs) + assert not (orig0 == model_gensim.wv.vectors[0]).all() # vector should vary after training + + sims_gensim = model_gensim.wv.most_similar('night', topn=10) + sims_gensim_words = [word for (word, distance) in sims_gensim] # get similar words + expected_sims_words = [ + u'night,', + u'night.', + u'eight', + u'nine', + u'overnight', + u'crew', + u'overnight.', + u'manslaughter', + u'north', + u'flight'] + overlaps = set(sims_gensim_words).intersection(expected_sims_words) + overlap_count = len(overlaps) + message = f"only {overlap_count} overlap in expected {expected_sims_words} & actual {sims_gensim_words}" + assert overlap_count >= 2, message + + with open(datapath('toy-data.txt')) as fin: TOY_SENTENCES = [fin.read().strip().split(' ')]