Fix method estimate_memory from gensim.models.FastText & huge performance improvement. Fix #1824 (#1916)
Changes from 10 commits
@@ -0,0 +1,21 @@
#!/usr/bin/env cython
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# coding: utf-8


cpdef ft_hash(unicode string):
    cdef unsigned int h = 2166136261
    for c in string:
        h ^= ord(c)
        h *= 16777619
    return h


cpdef compute_ngrams(word, unsigned int min_n, unsigned int max_n):
    cdef unicode extended_word = f'<{word}>'
    ngrams = []
    for ngram_length in range(min_n, min(len(extended_word), max_n) + 1):
        for i in range(0, len(extended_word) - ngram_length + 1):
            ngrams.append(extended_word[i:i + ngram_length])
    return ngrams
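For context, these two Cython helpers mirror fastText's subword handling: `ft_hash` is the 32-bit FNV-1a hash (offset basis 2166136261, prime 16777619) applied character by character, and `compute_ngrams` enumerates every character n-gram of `<word>` with length between `min_n` and `max_n`. A rough pure-Python equivalent, handy for checking the Cython version interactively (the `_py` names are just for this sketch):

```python
def ft_hash_py(string):
    """FNV-1a over the characters of `string`, kept to 32 bits like `unsigned int`."""
    h = 2166136261
    for c in string:
        h = (h ^ ord(c)) & 0xFFFFFFFF
        h = (h * 16777619) & 0xFFFFFFFF
    return h


def compute_ngrams_py(word, min_n, max_n):
    """All character n-grams of '<word>' with lengths in [min_n, max_n]."""
    extended = '<%s>' % word
    return [extended[i:i + n]
            for n in range(min_n, min(len(extended), max_n) + 1)
            for i in range(len(extended) - n + 1)]


print(compute_ngrams_py('night', 3, 4))
# ['<ni', 'nig', 'igh', 'ght', 'ht>', '<nig', 'nigh', 'ight', 'ght>']
print(ft_hash_py('<ni') % 2000000)  # bucket this n-gram falls into with the default 2M buckets
```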
@@ -39,7 +39,6 @@
from gensim.models.base_any2vec import BaseWordEmbeddingsModel
from gensim.models.utils_any2vec import _compute_ngrams, _ft_hash

from six import iteritems
from gensim.utils import deprecated, call_on_class_only
from gensim import utils

@@ -93,10 +92,11 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):

            for index in word2_indices:
                vocab_subwords_indices += [index]
                word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]]
                word2_subwords += _compute_ngrams(model.wv.index2word[index], model.min_n, model.max_n)

            for subword in word2_subwords:
                ngrams_subwords_indices.append(model.wv.ngrams[subword])
                ngrams_subwords_indices.append(
                    model.wv.hash2index[_ft_hash(subword) % model.bucket])

            l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0)  # 1 x vector_size
            l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0)  # 1 x vector_size

@@ -144,10 +144,10 @@ def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
            start = max(0, pos - model.window + reduced_window)

            subwords_indices = [word.index]
            word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]]
            word2_subwords = _compute_ngrams(model.wv.index2word[word.index], model.min_n, model.max_n)

            for subword in word2_subwords:
                subwords_indices.append(model.wv.ngrams[subword])
                subwords_indices.append(model.wv.hash2index[_ft_hash(subword) % model.bucket])

            for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                if pos2 != pos:  # don't train on the `word` itself

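The substantive change in both `train_batch_cbow` and `train_batch_sg` is the same: rather than looking n-grams up in the precomputed `wv.ngrams_word` and `wv.ngrams` dictionaries (which stored every n-gram string for every vocabulary word), the n-grams are recomputed on the fly and resolved to matrix rows through the hashing trick, which is where the memory savings come from. A hedged sketch of that resolution step, reusing the pure-Python helpers above (`subword_rows` is an illustrative name, not part of the PR):

```python
def subword_rows(word, hash2index, bucket, min_n=3, max_n=6):
    """Rows of `vectors_ngrams` touched by `word`, via the hashing trick.

    `hash2index` maps an occupied bucket id to a row index (built in
    init_ngrams_weights further down); `bucket` is the total number of hash buckets.
    """
    rows = []
    for ngram in compute_ngrams_py(word, min_n, max_n):
        bucket_id = ft_hash_py(ngram) % bucket   # n-gram -> bucket id
        rows.append(hash2index[bucket_id])       # bucket id -> matrix row
    return rows
```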
@@ -278,6 +278,7 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5,
            sorted_vocab=bool(sorted_vocab), null_word=null_word)
        self.trainables = FastTextTrainables(
            vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn)
        self.wv.bucket = self.bucket

        super(FastText, self).__init__(
            sentences=sentences, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks,

@@ -396,6 +397,37 @@ def _clear_post_train(self):
        self.wv.vectors_vocab_norm = None
        self.wv.vectors_ngrams_norm = None

    def estimate_memory(self, vocab_size=None, report=None):
        vocab_size = vocab_size or len(self.wv.vocab)
        vec_size = self.vector_size * np.dtype(np.float32).itemsize
        l1_size = self.layer1_size * np.dtype(np.float32).itemsize
        report = report or {}
        report['vocab'] = len(self.wv.vocab) * (700 if self.hs else 500)
        report['syn0_vocab'] = len(self.wv.vocab) * vec_size
        num_buckets = self.bucket
        if self.hs:
            report['syn1'] = len(self.wv.vocab) * l1_size
        if self.negative:
            report['syn1neg'] = len(self.wv.vocab) * l1_size
        if self.word_ngrams > 0 and self.wv.vocab:
            buckets = set()
            for word in self.wv.vocab:
                ngrams = _compute_ngrams(word, self.min_n, self.max_n)
                buckets.update(_ft_hash(ng) % self.bucket for ng in ngrams)
            num_buckets = len(buckets)
            report['syn0_ngrams'] = len(buckets) * vec_size
        elif self.word_ngrams > 0:
            logger.warn(
                'subword information is enabled, but no vocabulary could be found, estimated required memory might be '
                'inaccurate!'
            )
        report['total'] = sum(report.values())
        logger.info(
            "estimated required memory for %i words, %i buckets and %i dimensions: %i bytes",
            len(self.wv.vocab), num_buckets, self.vector_size, report['total']
        )
        return report

    def _do_train_job(self, sentences, alpha, inits):
        """Train a single batch of sentences. Return 2-tuple `(effective word count after
        ignoring unknown words and sentence length trimming, total word count)`.

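The fix to `estimate_memory` (the subject of #1824) is that the `syn0_ngrams` term is now derived from the number of distinct hash buckets the vocabulary's n-grams actually occupy, since after this PR that is exactly how many rows `vectors_ngrams` keeps. A toy, pure-Python illustration of the same counting idea, again reusing the helpers sketched above (names and numbers are illustrative; in practice you would just call `model.estimate_memory()` once the vocabulary is built):

```python
import numpy as np

def ngram_matrix_bytes(vocab_words, bucket=2000000, vector_size=100, min_n=3, max_n=6):
    """Estimate bytes for the n-gram matrix: one float32 row per occupied bucket."""
    occupied = set()
    for word in vocab_words:
        for ngram in compute_ngrams_py(word, min_n, max_n):
            occupied.add(ft_hash_py(ngram) % bucket)
    row_bytes = vector_size * np.dtype(np.float32).itemsize   # 4 bytes per dimension
    return len(occupied) * row_bytes

print(ngram_matrix_bytes(['night', 'nightly', 'knight']))  # tens of KB, far below bucket * row_bytes
```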
@@ -580,6 +612,7 @@ def _load_model_params(self, file_handle):
        self.hs = loss == 1
        self.sg = model == 2
        self.trainables.bucket = bucket
        self.wv.bucket = bucket
        self.wv.min_n = minn
        self.wv.max_n = maxn
        self.vocabulary.sample = t

@@ -709,18 +742,8 @@ def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, tr
        report_values = super(FastTextVocab, self).prepare_vocab(
            hs, negative, wv, update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,
            min_count=min_count, sample=sample, dry_run=dry_run)
        self.build_ngrams(wv, update=update)
        return report_values

    def build_ngrams(self, wv, update=False):
        if not update:
            wv.ngrams_word = {}
            for w, v in iteritems(wv.vocab):
                wv.ngrams_word[w] = _compute_ngrams(w, wv.min_n, wv.max_n)
        else:
            for w, v in iteritems(wv.vocab):
                wv.ngrams_word[w] = _compute_ngrams(w, wv.min_n, wv.max_n)


class FastTextTrainables(Word2VecTrainables):
    def __init__(self, vector_size=100, seed=1, hashfxn=hash, bucket=2000000):

@@ -744,54 +767,43 @@ def init_ngrams_weights(self, wv, update=False, vocabulary=None):

        """
        if not update:
            wv.ngrams = {}
            wv.vectors_vocab = empty((len(wv.vocab), wv.vector_size), dtype=REAL)
            self.vectors_vocab_lockf = ones((len(wv.vocab), wv.vector_size), dtype=REAL)

            wv.vectors_ngrams = empty((self.bucket, wv.vector_size), dtype=REAL)
            self.vectors_ngrams_lockf = ones((self.bucket, wv.vector_size), dtype=REAL)

            all_ngrams = []
            for w, ngrams in iteritems(wv.ngrams_word):
                all_ngrams += ngrams

            all_ngrams = list(set(all_ngrams))
            wv.num_ngram_vectors = len(all_ngrams)
            logger.info("Total number of ngrams is %d", len(all_ngrams))

            wv.hash2index = {}
            ngram_indices = []
            new_hash_count = 0
            for i, ngram in enumerate(all_ngrams):
                ngram_hash = _ft_hash(ngram) % self.bucket
                if ngram_hash in wv.hash2index:
                    wv.ngrams[ngram] = wv.hash2index[ngram_hash]
                else:
                    ngram_indices.append(ngram_hash % self.bucket)
                    wv.hash2index[ngram_hash] = new_hash_count
                    wv.ngrams[ngram] = wv.hash2index[ngram_hash]
                    new_hash_count = new_hash_count + 1
            wv.num_ngram_vectors = 0
            for word in wv.vocab.keys():
                for ngram in _compute_ngrams(word, wv.min_n, wv.max_n):
                    ngram_hash = _ft_hash(ngram) % self.bucket
                    if ngram_hash not in wv.hash2index:
                        wv.num_ngram_vectors += 1
                        ngram_indices.append(ngram_hash)
                        wv.hash2index[ngram_hash] = new_hash_count
                        new_hash_count = new_hash_count + 1

            logger.info("Total number of ngrams is %d", wv.num_ngram_vectors)

            wv.vectors_ngrams = wv.vectors_ngrams.take(ngram_indices, axis=0)
            self.vectors_ngrams_lockf = self.vectors_ngrams_lockf.take(ngram_indices, axis=0)
            self.reset_ngrams_weights(wv)
        else:
            new_ngrams = []
            for w, ngrams in iteritems(wv.ngrams_word):
                new_ngrams += [ng for ng in ngrams if ng not in wv.ngrams]

            new_ngrams = list(set(new_ngrams))
            wv.num_ngram_vectors += len(new_ngrams)
            logger.info("Number of new ngrams is %d", len(new_ngrams))
            new_hash_count = 0
            for i, ngram in enumerate(new_ngrams):
                ngram_hash = _ft_hash(ngram) % self.bucket
                if ngram_hash not in wv.hash2index:
                    wv.hash2index[ngram_hash] = new_hash_count + self.old_hash2index_len
                    wv.ngrams[ngram] = wv.hash2index[ngram_hash]
                    new_hash_count = new_hash_count + 1
                else:
                    wv.ngrams[ngram] = wv.hash2index[ngram_hash]
            num_new_ngrams = 0

[Review comment on this line: "There seems to be some redundancy again with …"]

            for word in wv.vocab.keys():
                for ngram in _compute_ngrams(word, wv.min_n, wv.max_n):
                    ngram_hash = _ft_hash(ngram) % self.bucket
                    if ngram_hash not in wv.hash2index:
                        wv.hash2index[ngram_hash] = new_hash_count + self.old_hash2index_len
                        new_hash_count = new_hash_count + 1
                        num_new_ngrams += 1

            wv.num_ngram_vectors += num_new_ngrams
            logger.info("Number of new ngrams is %d", num_new_ngrams)

            rand_obj = np.random
            rand_obj.seed(self.seed)

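A detail of the hunk above worth spelling out: `vectors_ngrams` is first allocated with `self.bucket` rows, then compacted with `take(ngram_indices, axis=0)` so that only occupied buckets keep a vector, and `hash2index` translates a bucket id into a row of the compacted matrix. A tiny numpy illustration of that compaction (all shapes and values made up):

```python
import numpy as np

bucket = 10                               # pretend hash space
full = np.random.rand(bucket, 4)          # one row per possible bucket
ngram_indices = [7, 2, 9]                 # buckets actually hit by the vocab's n-grams
hash2index = {7: 0, 2: 1, 9: 2}           # bucket id -> row in the compacted matrix

compact = full.take(ngram_indices, axis=0)            # shape (3, 4): only occupied buckets
assert np.allclose(compact[hash2index[9]], full[9])   # lookups go through hash2index
```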
@@ -833,10 +845,10 @@ def get_vocab_word_vecs(self, wv):
        """Calculate vectors for words in vocabulary and stores them in `vectors`."""
        for w, v in wv.vocab.items():
            word_vec = np.copy(wv.vectors_vocab[v.index])
            ngrams = wv.ngrams_word[w]
            ngrams = _compute_ngrams(w, wv.min_n, wv.max_n)
            ngram_weights = wv.vectors_ngrams
            for ngram in ngrams:
                word_vec += ngram_weights[wv.ngrams[ngram]]
                word_vec += ngram_weights[wv.hash2index[_ft_hash(ngram) % self.bucket]]
            word_vec /= (len(ngrams) + 1)
            wv.vectors[v.index] = word_vec

@@ -847,20 +859,21 @@ def init_ngrams_post_load(self, file_name, wv):
        vectors are discarded here to save space.

        """
        all_ngrams = []
        wv.vectors = np.zeros((len(wv.vocab), wv.vector_size), dtype=REAL)

        for w, vocab in wv.vocab.items():
            all_ngrams += _compute_ngrams(w, wv.min_n, wv.max_n)
            wv.vectors[vocab.index] += np.array(wv.vectors_ngrams[vocab.index])

        all_ngrams = set(all_ngrams)
        wv.num_ngram_vectors = len(all_ngrams)
        ngram_indices = []
        for i, ngram in enumerate(all_ngrams):
            ngram_hash = _ft_hash(ngram)
            ngram_indices.append(len(wv.vocab) + ngram_hash % self.bucket)
            wv.ngrams[ngram] = i
        wv.num_ngram_vectors = 0
        for word in wv.vocab.keys():
            for ngram in _compute_ngrams(word, wv.min_n, wv.max_n):
                ngram_hash = _ft_hash(ngram) % self.bucket
                if ngram_hash in wv.hash2index:
                    continue
                ngram_indices.append(len(wv.vocab) + ngram_hash)
                wv.hash2index[ngram_hash] = wv.num_ngram_vectors
                wv.num_ngram_vectors += 1

[Review comment on this line: "Can be set to …"]

        wv.vectors_ngrams = wv.vectors_ngrams.take(ngram_indices, axis=0)

        ngram_weights = wv.vectors_ngrams

@@ -873,7 +886,8 @@ def init_ngrams_post_load(self, file_name, wv):
        for w, vocab in wv.vocab.items():
            word_ngrams = _compute_ngrams(w, wv.min_n, wv.max_n)
            for word_ngram in word_ngrams:
                wv.vectors[vocab.index] += np.array(ngram_weights[wv.ngrams[word_ngram]])
                vec_idx = wv.hash2index[_ft_hash(word_ngram) % self.bucket]
                wv.vectors[vocab.index] += np.array(ngram_weights[vec_idx])

            wv.vectors[vocab.index] /= (len(word_ngrams) + 1)
        logger.info(

Review comment:
We could probably reduce some variables here - there seems to be some redundancy, if I understand correctly. `wv.num_ngram_vectors`, `new_hash_count` and `len(ngram_indices)` serve effectively the same purpose. Maybe we could use `len(ngram_indices)` within the loop and set `wv.num_ngram_vectors` at the end of the loop?
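For illustration, the simplification the reviewer describes could look roughly like this (a sketch only, not the code that was merged; it reuses the pure-Python helpers from the first sketch and plain locals in place of the `wv` attributes):

```python
def assign_ngram_buckets(vocab_words, bucket, min_n, max_n):
    """Build the bucket -> row mapping, letting len(ngram_indices) drive the count."""
    hash2index = {}
    ngram_indices = []
    for word in vocab_words:
        for ngram in compute_ngrams_py(word, min_n, max_n):
            ngram_hash = ft_hash_py(ngram) % bucket
            if ngram_hash not in hash2index:
                hash2index[ngram_hash] = len(ngram_indices)   # next free row, no new_hash_count
                ngram_indices.append(ngram_hash)
    num_ngram_vectors = len(ngram_indices)                    # set once, after the loop
    return hash2index, ngram_indices, num_ngram_vectors
```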