From 263e77f49dd753ff73e96a63757cc245ff4b5aa0 Mon Sep 17 00:00:00 2001
From: Thien Tran
Date: Fri, 17 Feb 2023 11:05:26 +0800
Subject: [PATCH] check hs and negative. add tests

---
 gensim/models/word2vec.py    | 15 +++++++++++++--
 gensim/test/test_word2vec.py | 15 ++++++++++++++-
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index d4a4ba992e..a3fe865b7a 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -286,11 +286,11 @@ def __init__(
             Training algorithm: 1 for skip-gram; otherwise CBOW.
         hs : {0, 1}, optional
             If 1, hierarchical softmax will be used for model training.
-            If 0, and `negative` is non-zero, negative sampling will be used.
+            If 0, hierarchical softmax will not be used for model training.
         negative : int, optional
            If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
             should be drawn (usually between 5-20).
-            If set to 0, no negative sampling is used.
+            If 0, negative sampling will not be used.
         ns_exponent : float, optional
             The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion
             to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more
@@ -1536,6 +1536,17 @@ def _check_training_sanity(self, epochs=0, total_examples=None, total_words=None
             If the combination of input parameters is inconsistent.
 
         """
+        if (not self.hs) and (not self.negative):
+            raise ValueError(
+                "You must set either 'hs' or 'negative' to be positive for proper training. "
+                "When both 'hs=0' and 'negative=0', there will be no training."
+            )
+        if self.hs and self.negative:
+            logger.warning(
+                "Both hierarchical softmax and negative sampling are activated. "
+                "This is probably a mistake. You should set either 'hs=0' "
+                "or 'negative=0' to disable one of them."
+            )
         if self.alpha > self.min_alpha_yet_reached:
             logger.warning("Effective 'alpha' higher than previous training cycles")
 
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index a07cf08b10..74639af865 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -888,7 +888,7 @@ def test_predict_output_word(self):
         self.assertRaises(RuntimeError, binary_model_with_neg.predict_output_word, ['system', 'human'])
 
         # negative sampling scheme not used
-        model_without_neg = word2vec.Word2Vec(sentences, min_count=1, negative=0)
+        model_without_neg = word2vec.Word2Vec(sentences, min_count=1, hs=1, negative=0)
         self.assertRaises(RuntimeError, model_without_neg.predict_output_word, ['system', 'human'])
 
         # passing indices instead of words in context
@@ -1033,6 +1033,19 @@ def test_train_warning(self, loglines):
         warning = "Effective 'alpha' higher than previous training cycles"
         self.assertTrue(warning in str(loglines))
 
+    @log_capture()
+    def test_train_hs_and_neg(self, loglines):
+        """
+        Test that ValueError is raised when both hs=0 and negative=0.
+        Test that a warning is logged when both hs and negative are activated.
+        """
+        with self.assertRaises(ValueError):
+            word2vec.Word2Vec(sentences, min_count=1, hs=0, negative=0)
+
+        word2vec.Word2Vec(sentences, min_count=1, hs=1, negative=5)
+        warning = "Both hierarchical softmax and negative sampling are activated."
+        self.assertTrue(warning in str(loglines))
+
     def test_train_with_explicit_param(self):
         model = word2vec.Word2Vec(vector_size=2, min_count=1, hs=1, negative=0)
         model.build_vocab(sentences)
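
For reviewers who want to try the new checks locally, here is a minimal sketch of the behavior this patch enforces, assuming gensim with the patch applied and using `common_texts`, the small toy corpus shipped in `gensim.test.utils` (any iterable of tokenized sentences would do):

    from gensim.models import word2vec
    from gensim.test.utils import common_texts

    # With this patch, a model with neither training objective enabled
    # (hs=0 and negative=0) raises immediately instead of "training"
    # without ever updating any weights.
    try:
        word2vec.Word2Vec(common_texts, min_count=1, hs=0, negative=0)
    except ValueError as err:
        print(err)

    # Enabling both objectives still trains, but now logs a warning
    # suggesting that one of 'hs=0' or 'negative=0' be set.
    model = word2vec.Word2Vec(common_texts, min_count=1, hs=1, negative=5)
    print(model.wv.most_similar('computer', topn=3))

Note the asymmetry in the two checks: the no-objective combination hard-fails with ValueError because training would silently be a no-op, while the both-objectives combination only warns, since it is unusual but still produces a trained model.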