From 2bc36139a60ee990de9d3aa43f378a00fee15a90 Mon Sep 17 00:00:00 2001
From: Cristi Burca
Date: Fri, 31 May 2019 16:08:33 +0100
Subject: [PATCH 1/2] Expose max_final_vocab parameter in FastText constructor

---
 gensim/models/fasttext.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 4739534612..29714cacb0 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -461,7 +461,7 @@ class FastText(BaseWordEmbeddingsModel):
 
     """
     def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
-                 max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
+                 max_vocab_size=None, max_final_vocab=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
                  negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6,
                  sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(),
                  compatible_hash=True):
@@ -507,6 +507,10 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha
             Limits the RAM during vocabulary building; if there are more unique
             words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
             Set to `None` for no limit.
+        max_final_vocab : int, optional
+            Limits the vocab to a target vocab size by automatically picking a matching min_count.
+            If the specified min_count is more than the calculated min_count, the specified
+            min_count will be used. Set to `None` to disable.
         sample : float, optional
             The threshold for configuring which higher-frequency words are randomly downsampled,
             useful range is (0, 1e-5).
@@ -589,8 +593,14 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha
 
         self.wv = FastTextKeyedVectors(size, min_n, max_n, bucket, compatible_hash)
         self.vocabulary = FastTextVocab(
-            max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
-            sorted_vocab=bool(sorted_vocab), null_word=null_word, ns_exponent=ns_exponent)
+            max_vocab_size=max_vocab_size,
+            max_final_vocab=max_final_vocab,
+            min_count=min_count,
+            sample=sample,
+            sorted_vocab=bool(sorted_vocab),
+            null_word=null_word,
+            ns_exponent=ns_exponent
+        )
         self.trainables = FastTextTrainables(vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn)
         self.trainables.prepare_weights(hs, negative, self.wv, update=False, vocabulary=self.vocabulary)
         self.wv.bucket = self.trainables.bucket

From c16f5b42ad05795f2cf3e2ffa5e94b4d6a69963f Mon Sep 17 00:00:00 2001
From: Cristi Burca
Date: Fri, 31 May 2019 16:47:12 +0100
Subject: [PATCH 2/2] Fix lint error

---
 gensim/models/fasttext.py | 37 ++++++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 29714cacb0..d9777055aa 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -460,11 +460,38 @@ class FastText(BaseWordEmbeddingsModel):
     for the internal structure of words, besides their concurrence counts.
 
""" - def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, max_final_vocab=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, - sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(), - compatible_hash=True): + def __init__( + self, + sentences=None, + corpus_file=None, + sg=0, + hs=0, + size=100, + alpha=0.025, + window=5, + min_count=5, + max_vocab_size=None, + max_final_vocab=None, + word_ngrams=1, + sample=1e-3, + seed=1, + workers=3, + min_alpha=0.0001, + negative=5, + ns_exponent=0.75, + cbow_mean=1, + hashfxn=hash, + iter=5, + null_word=0, + min_n=3, + max_n=6, + sorted_vocab=1, + bucket=2000000, + trim_rule=None, + batch_words=MAX_WORDS_IN_BATCH, + callbacks=(), + compatible_hash=True + ): """ Parameters