Expose max_final_vocab parameter in FastText constructor #2516

Closed · wants to merge 2 commits

51 changes: 44 additions & 7 deletions gensim/models/fasttext.py
@@ -460,11 +460,38 @@ class FastText(BaseWordEmbeddingsModel):
for the internal structure of words, besides their concurrence counts.

"""
def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6,
sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(),
compatible_hash=True):
def __init__(
self,
sentences=None,
corpus_file=None,
sg=0,
hs=0,
size=100,
alpha=0.025,
window=5,
min_count=5,
max_vocab_size=None,
max_final_vocab=None,
Collaborator:
Move this parameter to the end of the list. We don't want to break backward compatibility for people who insist on passing keyword arguments as unnamed.

Also, please avoid mixing in innocuous formatting changes with actual functionality, as it makes your code more difficult to review. Roll back this formatting change in the constructor.
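
To make the concern concrete, here is a minimal sketch (mine, not part of this PR) of how a purely positional call against the signature shown in this diff breaks if the new parameter is inserted mid-list instead of appended:

```python
from gensim.models import FastText

sentences = [["human", "interface", "computer"], ["survey", "user", "computer", "system"]]

# Positional call against the pre-PR signature, where argument 10 binds to word_ngrams:
#   (sentences, corpus_file, sg, hs, size, alpha, window, min_count, max_vocab_size, word_ngrams, ...)
model = FastText(sentences, None, 1, 0, 100, 0.025, 5, 1, None, 1)

# If max_final_vocab is inserted right after max_vocab_size, the trailing 1 above now
# binds to max_final_vocab instead of word_ngrams -- the call still "works" but means
# something different. Appending new parameters at the end keeps old positions valid.
```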

word_ngrams=1,
sample=1e-3,
seed=1,
workers=3,
min_alpha=0.0001,
negative=5,
ns_exponent=0.75,
cbow_mean=1,
hashfxn=hash,
iter=5,
null_word=0,
min_n=3,
max_n=6,
sorted_vocab=1,
bucket=2000000,
trim_rule=None,
batch_words=MAX_WORDS_IN_BATCH,
callbacks=(),
compatible_hash=True
):
"""

Parameters
@@ -507,6 +534,10 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha
Limits the RAM during vocabulary building; if there are more unique
words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
Set to `None` for no limit.
max_final_vocab : int, optional
Collaborator:
I think it's worth explaining the difference between this and max_vocab_size. How do they interact together?
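
For reference, here is how I understand the two limits interacting, assuming the semantics are inherited unchanged from Word2Vec (FastTextVocab extends Word2VecVocab): `max_vocab_size` bounds RAM during the vocabulary scan by pruning infrequent words on the fly, so its effect depends on the order of the stream, whereas `max_final_vocab` is applied after the scan by raising the effective `min_count` until at most that many word types survive. A hedged usage sketch, not code from this PR:

```python
from gensim.models import FastText

# Toy corpus with a clear frequency gradient: "w19" appears 20 times, "w0" once.
sentences = [["w%d" % i] * (i + 1) for i in range(20)]

model = FastText(
    sentences,
    min_count=1,
    max_vocab_size=None,   # no pruning while counting (RAM bound disabled)
    max_final_vocab=10,    # after counting, keep only the ~10 most frequent types
)
print(len(model.wv.vocab))  # expected <= 10 once this PR wires the parameter through
```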

Prunes the final vocabulary to this number of word types.

Set to `None` to disable.
sample : float, optional
The threshold for configuring which higher-frequency words are randomly downsampled,
useful range is (0, 1e-5).
@@ -589,8 +620,14 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha

self.wv = FastTextKeyedVectors(size, min_n, max_n, bucket, compatible_hash)
self.vocabulary = FastTextVocab(
max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
sorted_vocab=bool(sorted_vocab), null_word=null_word, ns_exponent=ns_exponent)
max_vocab_size=max_vocab_size,
max_final_vocab=max_final_vocab,
min_count=min_count,
sample=sample,
sorted_vocab=bool(sorted_vocab),
null_word=null_word,
ns_exponent=ns_exponent
Collaborator:
Future-proof your code. If we want to add more parameters to end of this list, we won't have to change the last line.

Suggested change:
-    ns_exponent=ns_exponent
+    ns_exponent=ns_exponent,

)
self.trainables = FastTextTrainables(vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn)
self.trainables.prepare_weights(hs, negative, self.wv, update=False, vocabulary=self.vocabulary)
self.wv.bucket = self.trainables.bucket
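
For reviewers who want the underlying mechanism spelled out: below is a standalone sketch of the calibration step that `max_final_vocab` implies in Word2VecVocab, which FastTextVocab builds on. The helper name and example counts are mine and hypothetical, not part of this diff.

```python
from collections import Counter


def effective_min_count(raw_counts, max_final_vocab, min_count):
    """Hypothetical helper: choose the smallest min_count that leaves at most
    max_final_vocab word types, mirroring the calibration done for Word2Vec."""
    if max_final_vocab is None or len(raw_counts) <= max_final_vocab:
        return min_count
    counts = sorted(raw_counts.values(), reverse=True)
    # Count of the first word that must NOT survive; requiring at least one more
    # occurrence than that word guarantees the retained vocab fits the cap.
    cutoff = counts[max_final_vocab] + 1
    return max(cutoff, min_count)


raw_counts = Counter({"the": 50, "quick": 20, "brown": 20, "fox": 5, "jumps": 1})
print(effective_min_count(raw_counts, max_final_vocab=3, min_count=1))
# -> 6: only "the", "quick" and "brown" (counts >= 6) are kept, i.e. at most 3 types.
```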