Sanity check for hs and negative in Word2Vec #3443

Merged · merged 1 commit on Feb 17, 2023
15 changes: 13 additions & 2 deletions gensim/models/word2vec.py
@@ -286,11 +286,11 @@ def __init__(
Training algorithm: 1 for skip-gram; otherwise CBOW.
hs : {0, 1}, optional
If 1, hierarchical softmax will be used for model training.
- If 0, and `negative` is non-zero, negative sampling will be used.
+ If 0, hierarchical softmax will not be used for model training.
negative : int, optional
If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
should be drawn (usually between 5-20).
- If set to 0, no negative sampling is used.
+ If 0, negative sampling will not be used.
ns_exponent : float, optional
The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion
to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more
@@ -1536,6 +1536,17 @@ def _check_training_sanity(self, epochs=0, total_examples=None, total_words=None
If the combination of input parameters is inconsistent.

"""
+ if (not self.hs) and (not self.negative):
+     raise ValueError(
+         "You must set either 'hs' or 'negative' to be positive for proper training. "
+         "When both 'hs=0' and 'negative=0', there will be no training."
+     )
+ if self.hs and self.negative:
+     logger.warning(
+         "Both hierarchical softmax and negative sampling are activated. "
+         "This is probably a mistake. You should set either 'hs=0' "
+         "or 'negative=0' to disable one of them. "
+     )
if self.alpha > self.min_alpha_yet_reached:
logger.warning("Effective 'alpha' higher than previous training cycles")

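For context, a minimal usage sketch (not part of this PR; it assumes the gensim 4.x `Word2Vec` constructor and uses a made-up toy corpus) of how the four `hs`/`negative` combinations behave once this check is in place:

from gensim.models import Word2Vec

# Tiny illustrative corpus, only for demonstration.
sentences = [["human", "interface", "computer"], ["survey", "user", "computer", "system"]]

# Negative sampling only (the default): hs=0, negative>0 -- trains normally.
Word2Vec(sentences, min_count=1, hs=0, negative=5)

# Hierarchical softmax only: hs=1, negative=0 -- trains normally.
Word2Vec(sentences, min_count=1, hs=1, negative=0)

# Both enabled: hs=1, negative>0 -- still trains, but now logs the "probably a mistake" warning.
Word2Vec(sentences, min_count=1, hs=1, negative=5)

# Both disabled: hs=0, negative=0 -- with this patch, raises ValueError instead of silently training nothing.
try:
    Word2Vec(sentences, min_count=1, hs=0, negative=0)
except ValueError as err:
    print(err)
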
15 changes: 14 additions & 1 deletion gensim/test/test_word2vec.py
@@ -888,7 +888,7 @@ def test_predict_output_word(self):
self.assertRaises(RuntimeError, binary_model_with_neg.predict_output_word, ['system', 'human'])

# negative sampling scheme not used
- model_without_neg = word2vec.Word2Vec(sentences, min_count=1, negative=0)
+ model_without_neg = word2vec.Word2Vec(sentences, min_count=1, hs=1, negative=0)
self.assertRaises(RuntimeError, model_without_neg.predict_output_word, ['system', 'human'])

# passing indices instead of words in context
@@ -1033,6 +1033,19 @@ def test_train_warning(self, loglines):
warning = "Effective 'alpha' higher than previous training cycles"
self.assertTrue(warning in str(loglines))

+ @log_capture()
+ def test_train_hs_and_neg(self, loglines):
+     """
+     Test if ValueError is raised when both hs=0 and negative=0
+     Test if warning is raised if both hs and negative are activated
+     """
+     with self.assertRaises(ValueError):
+         word2vec.Word2Vec(sentences, min_count=1, hs=0, negative=0)
+
+     word2vec.Word2Vec(sentences, min_count=1, hs=1, negative=5)
+     warning = "Both hierarchical softmax and negative sampling are activated."
+     self.assertTrue(warning in str(loglines))
+
def test_train_with_explicit_param(self):
model = word2vec.Word2Vec(vector_size=2, min_count=1, hs=1, negative=0)
model.build_vocab(sentences)