Expose max_final_vocab parameter in FastText constructor #2516
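For orientation, here is a minimal usage sketch of what this change enables, assuming `max_final_vocab` carries the same semantics as the parameter of the same name already exposed by `Word2Vec` (the corpus and sizes below are made up for illustration):

```python
from gensim.models import FastText

sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]

# With this PR, the finished vocabulary can be capped at construction time,
# mirroring the max_final_vocab parameter that Word2Vec already exposes.
model = FastText(sentences=sentences, size=8, min_count=1, max_final_vocab=10000)
```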
```diff
@@ -460,11 +460,38 @@ class FastText(BaseWordEmbeddingsModel):
     for the internal structure of words, besides their concurrence counts.
 
     """
-    def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
-                 max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
-                 negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6,
-                 sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(),
-                 compatible_hash=True):
+    def __init__(
+            self,
+            sentences=None,
+            corpus_file=None,
+            sg=0,
+            hs=0,
+            size=100,
+            alpha=0.025,
+            window=5,
+            min_count=5,
+            max_vocab_size=None,
+            max_final_vocab=None,
+            word_ngrams=1,
+            sample=1e-3,
+            seed=1,
+            workers=3,
+            min_alpha=0.0001,
+            negative=5,
+            ns_exponent=0.75,
+            cbow_mean=1,
+            hashfxn=hash,
+            iter=5,
+            null_word=0,
+            min_n=3,
+            max_n=6,
+            sorted_vocab=1,
+            bucket=2000000,
+            trim_rule=None,
+            batch_words=MAX_WORDS_IN_BATCH,
+            callbacks=(),
+            compatible_hash=True
+    ):
         """
 
         Parameters
```
```diff
@@ -507,6 +534,10 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha
             Limits the RAM during vocabulary building; if there are more unique
             words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
             Set to `None` for no limit.
+        max_final_vocab : int, optional
+            Prunes the final vocabulary to this number of word types.
+            Set to `None` to disable.
         sample : float, optional
             The threshold for configuring which higher-frequency words are randomly downsampled,
             useful range is (0, 1e-5).
```

Review comment (on `max_final_vocab : int, optional`): I think it's worth explaining the difference between this and `max_vocab_size`.
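To make the reviewer's point concrete: `max_vocab_size` bounds RAM while counting words, by pruning infrequent entries during vocabulary building, whereas `max_final_vocab` caps the finished vocabulary, effectively by raising `min_count` until at most that many word types survive. A standalone sketch of the final-cap idea (an illustration of the concept, not gensim's actual implementation):

```python
from collections import Counter

def cap_final_vocab(raw_counts, max_final_vocab, min_count):
    """Keep at most max_final_vocab word types by raising the effective
    min_count; purely illustrative, not gensim's actual code."""
    if max_final_vocab is not None:
        sorted_counts = sorted(raw_counts.values(), reverse=True)
        if len(sorted_counts) > max_final_vocab:
            # Smallest threshold that discards everything past the cap.
            min_count = max(min_count, sorted_counts[max_final_vocab] + 1)
    return {w: c for w, c in raw_counts.items() if c >= min_count}

raw = Counter(["a"] * 5 + ["b"] * 3 + ["c"] * 2 + ["d"])
print(cap_final_vocab(raw, max_final_vocab=2, min_count=1))
# -> {'a': 5, 'b': 3}
```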
```diff
@@ -589,8 +620,14 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha
 
         self.wv = FastTextKeyedVectors(size, min_n, max_n, bucket, compatible_hash)
         self.vocabulary = FastTextVocab(
-            max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
-            sorted_vocab=bool(sorted_vocab), null_word=null_word, ns_exponent=ns_exponent)
+            max_vocab_size=max_vocab_size,
+            max_final_vocab=max_final_vocab,
+            min_count=min_count,
+            sample=sample,
+            sorted_vocab=bool(sorted_vocab),
+            null_word=null_word,
+            ns_exponent=ns_exponent
+        )
         self.trainables = FastTextTrainables(vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn)
         self.trainables.prepare_weights(hs, negative, self.wv, update=False, vocabulary=self.vocabulary)
         self.wv.bucket = self.trainables.bucket
```

Review comment (on `ns_exponent=ns_exponent`): Future-proof your code. If we want to add more parameters to the end of this list, we won't have to change the last line. Suggested change: give the last argument a trailing comma, i.e. `ns_exponent=ns_exponent,`.
Review comment (on `max_final_vocab=None,` in the constructor signature): Move this parameter to the end of the list. We don't want to break backward compatibility for people who insist on passing keyword arguments as unnamed. Also, please avoid mixing in innocuous formatting changes with actual functionality, as it makes your code more difficult to review. Roll back this formatting change in the constructor.
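To illustrate the backward-compatibility concern with a hypothetical caller (made-up values, old 3.x-style signature): positional arguments bind by position, so inserting `max_final_vocab` mid-list silently changes what an existing call means.

```python
from gensim.models import FastText

sentences = [["hello", "world", "hello"]]

# Under the old signature the 10th positional argument was word_ngrams;
# with max_final_vocab inserted before it, the same 1 would instead bind
# to max_final_vocab -- a silent behavior change for this caller.
model = FastText(sentences, None, 0, 0, 100, 0.025, 5, 1, None, 1)
```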