Fix estimate_memory method of gensim.models.FastText & huge performance improvement. Fix #1824 (#1916)

Merged
merged 22 commits on Mar 1, 2018

Changes from 10 commits

Commits (22)
3db9c63
Cythonize fasttext.ft_hash for 100x performance improvement
jbaiter Jan 3, 2018
9f3428a
Cythonize fasttext.compute_ngrams for 2x performance improvement
jbaiter Jan 3, 2018
51a1a6e
Reduce fasttext memory usage by computing ngrams on the fly
jbaiter Jan 3, 2018
f467ab9
Fix compute_ngrams for Python 2
jbaiter Feb 22, 2018
783114a
Merge branch 'develop' into fasttext-optimization
jbaiter Feb 22, 2018
5c576ad
Store OOV vec in variable for more informative assertion error in tes…
jbaiter Feb 22, 2018
0a5912c
Revert all changes to fasttext_wrapper
jbaiter Feb 22, 2018
9a36b08
Fix indentation for multi-line expressions
jbaiter Feb 23, 2018
764071b
Rename utils_any2vec_fast to _utils_any2vec
jbaiter Feb 23, 2018
722cdda
Merge remote-tracking branch 'upstream/develop' into fasttext-optimiz…
jbaiter Feb 23, 2018
c6f347e
fasttext: Cache ngram buckets for words during training
jbaiter Feb 26, 2018
1d86111
Remove last occurrences of wv.ngrams_word and wv.ngrams
jbaiter Feb 26, 2018
6aaab0a
fasttext: use buckets_word cache also for non-Cython training
jbaiter Feb 28, 2018
85679ed
fasttext: Add buckets_ngram size to memory estimate
jbaiter Feb 28, 2018
e574e90
fasttext: Don't store buckets_word with the model
jbaiter Feb 28, 2018
2a090c6
fasttext: Use smaller model for test_estimate_memory
jbaiter Feb 28, 2018
33968dc
fasttext: Fix pure python training code
jbaiter Feb 28, 2018
76a0675
fasttext: Fix asserts for test_estimate_memory
jbaiter Feb 28, 2018
0a2ae3c
fasttext: Fix typo and style errors
jbaiter Feb 28, 2018
0fe0f80
fasttext: Simplify code as per @jayantj's review
jbaiter Feb 28, 2018
7cb46e3
Update MANIFEST.in and documentation with utils_any2vec implementations
jbaiter Feb 28, 2018
dcc0857
last fixes (add option for cython compiler, fix descriptions, etc)
menshikh-iv Mar 1, 2018
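
Taken together, the commits drop the per-word ngram bookkeeping (wv.ngrams_word and wv.ngrams) and instead hash each ngram into one of the model's buckets on demand. A rough before/after sketch of the lookup, using the names that appear in the diff below (illustrative only, not a drop-in snippet):

# Before: ngram strings and a str -> row mapping were kept for every vocab word.
word2_subwords = model.wv.ngrams_word[word]
rows = [model.wv.ngrams[ng] for ng in word2_subwords]

# After: ngrams are recomputed and hashed into `bucket` slots when needed.
word2_subwords = _compute_ngrams(word, model.min_n, model.max_n)
rows = [model.wv.hash2index[_ft_hash(ng) % model.bucket] for ng in word2_subwords]

This removes the two large dictionaries from the model; the Cython helpers below make the recomputation cheap enough to do inside the training loop.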
3,070 changes: 3,070 additions & 0 deletions gensim/models/_utils_any2vec.c

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions gensim/models/_utils_any2vec.pyx
@@ -0,0 +1,21 @@
#!/usr/bin/env cython
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# coding: utf-8

cpdef ft_hash(unicode string):
cdef unsigned int h = 2166136261
for c in string:
h ^= ord(c)
h *= 16777619
return h


cpdef compute_ngrams(word, unsigned int min_n, unsigned int max_n):
cdef unicode extended_word = f'<{word}>'
ngrams = []
for ngram_length in range(min_n, min(len(extended_word), max_n) + 1):
for i in range(0, len(extended_word) - ngram_length + 1):
ngrams.append(extended_word[i:i + ngram_length])
return ngrams
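
For reference, a pure-Python mirror of the two helpers above; the Cython version gets its wraparound from C unsigned int arithmetic, so this sketch masks to 32 bits to imitate it (an illustration, not the package's pure-Python fallback):

def ft_hash_py(string):
    # FNV-1a over the code points of `string`, masked to mimic 32-bit unsigned overflow.
    h = 2166136261
    for c in string:
        h = (h ^ ord(c)) & 0xFFFFFFFF
        h = (h * 16777619) & 0xFFFFFFFF
    return h

def compute_ngrams_py(word, min_n, max_n):
    # All character ngrams of length min_n..max_n of the word wrapped in '<' and '>'.
    extended_word = '<%s>' % word
    ngrams = []
    for ngram_length in range(min_n, min(len(extended_word), max_n) + 1):
        for i in range(0, len(extended_word) - ngram_length + 1):
            ngrams.append(extended_word[i:i + ngram_length])
    return ngrams

# Example: bucket indices for one word, as the training and lookup code computes them.
indices = [ft_hash_py(ng) % 2000000 for ng in compute_ngrams_py('night', 3, 6)]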
132 changes: 73 additions & 59 deletions gensim/models/fasttext.py
@@ -39,7 +39,6 @@
from gensim.models.base_any2vec import BaseWordEmbeddingsModel
from gensim.models.utils_any2vec import _compute_ngrams, _ft_hash

from six import iteritems
from gensim.utils import deprecated, call_on_class_only
from gensim import utils

@@ -93,10 +92,11 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):

for index in word2_indices:
vocab_subwords_indices += [index]
word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]]
word2_subwords += _compute_ngrams(model.wv.index2word[index], model.min_n, model.max_n)

for subword in word2_subwords:
ngrams_subwords_indices.append(model.wv.ngrams[subword])
ngrams_subwords_indices.append(
model.wv.hash2index[_ft_hash(subword) % model.bucket])

l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0) # 1 x vector_size
l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0) # 1 x vector_size
@@ -144,10 +144,10 @@ def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
start = max(0, pos - model.window + reduced_window)

subwords_indices = [word.index]
word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]]
word2_subwords = _compute_ngrams(model.wv.index2word[word.index], model.min_n, model.max_n)

for subword in word2_subwords:
subwords_indices.append(model.wv.ngrams[subword])
subwords_indices.append(model.wv.hash2index[_ft_hash(subword) % model.bucket])

for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
if pos2 != pos: # don't train on the `word` itself
@@ -278,6 +278,7 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5,
sorted_vocab=bool(sorted_vocab), null_word=null_word)
self.trainables = FastTextTrainables(
vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn)
self.wv.bucket = self.bucket

super(FastText, self).__init__(
sentences=sentences, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks,
@@ -396,6 +397,37 @@ def _clear_post_train(self):
self.wv.vectors_vocab_norm = None
self.wv.vectors_ngrams_norm = None

def estimate_memory(self, vocab_size=None, report=None):
vocab_size = vocab_size or len(self.wv.vocab)
vec_size = self.vector_size * np.dtype(np.float32).itemsize
l1_size = self.layer1_size * np.dtype(np.float32).itemsize
report = report or {}
report['vocab'] = len(self.wv.vocab) * (700 if self.hs else 500)
report['syn0_vocab'] = len(self.wv.vocab) * vec_size
num_buckets = self.bucket
if self.hs:
report['syn1'] = len(self.wv.vocab) * l1_size
if self.negative:
report['syn1neg'] = len(self.wv.vocab) * l1_size
if self.word_ngrams > 0 and self.wv.vocab:
buckets = set()
for word in self.wv.vocab:
ngrams = _compute_ngrams(word, self.min_n, self.max_n)
buckets.update(_ft_hash(ng) % self.bucket for ng in ngrams)
num_buckets = len(buckets)
report['syn0_ngrams'] = len(buckets) * vec_size
elif self.word_ngrams > 0:
logger.warn(
'subword information is enabled, but no vocabulary could be found, estimated required memory might be '
'inaccurate!'
)
report['total'] = sum(report.values())
logger.info(
"estimated required memory for %i words, %i buckets and %i dimensions: %i bytes",
len(self.wv.vocab), num_buckets, self.vector_size, report['total']
)
return report

def _do_train_job(self, sentences, alpha, inits):
"""Train a single batch of sentences. Return 2-tuple `(effective word count after
ignoring unknown words and sentence length trimming, total word count)`.
@@ -580,6 +612,7 @@ def _load_model_params(self, file_handle):
self.hs = loss == 1
self.sg = model == 2
self.trainables.bucket = bucket
self.wv.bucket = bucket
self.wv.min_n = minn
self.wv.max_n = maxn
self.vocabulary.sample = t
@@ -709,18 +742,8 @@ def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, tr
report_values = super(FastTextVocab, self).prepare_vocab(
hs, negative, wv, update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,
min_count=min_count, sample=sample, dry_run=dry_run)
self.build_ngrams(wv, update=update)
return report_values

def build_ngrams(self, wv, update=False):
if not update:
wv.ngrams_word = {}
for w, v in iteritems(wv.vocab):
wv.ngrams_word[w] = _compute_ngrams(w, wv.min_n, wv.max_n)
else:
for w, v in iteritems(wv.vocab):
wv.ngrams_word[w] = _compute_ngrams(w, wv.min_n, wv.max_n)


class FastTextTrainables(Word2VecTrainables):
def __init__(self, vector_size=100, seed=1, hashfxn=hash, bucket=2000000):
@@ -744,54 +767,43 @@ def init_ngrams_weights(self, wv, update=False, vocabulary=None):

"""
if not update:
wv.ngrams = {}
wv.vectors_vocab = empty((len(wv.vocab), wv.vector_size), dtype=REAL)
self.vectors_vocab_lockf = ones((len(wv.vocab), wv.vector_size), dtype=REAL)

wv.vectors_ngrams = empty((self.bucket, wv.vector_size), dtype=REAL)
self.vectors_ngrams_lockf = ones((self.bucket, wv.vector_size), dtype=REAL)

all_ngrams = []
for w, ngrams in iteritems(wv.ngrams_word):
all_ngrams += ngrams

all_ngrams = list(set(all_ngrams))
wv.num_ngram_vectors = len(all_ngrams)
logger.info("Total number of ngrams is %d", len(all_ngrams))

wv.hash2index = {}
ngram_indices = []
new_hash_count = 0
for i, ngram in enumerate(all_ngrams):
ngram_hash = _ft_hash(ngram) % self.bucket
if ngram_hash in wv.hash2index:
wv.ngrams[ngram] = wv.hash2index[ngram_hash]
else:
ngram_indices.append(ngram_hash % self.bucket)
wv.hash2index[ngram_hash] = new_hash_count
wv.ngrams[ngram] = wv.hash2index[ngram_hash]
new_hash_count = new_hash_count + 1
wv.num_ngram_vectors = 0
Contributor review comment:
We could probably reduce some variables here - there seems to be some redundancy, if I understand correctly. wv.num_ngram_vectors, new_hash_count and len(ngram_indices) serve effectively the same purpose.
Maybe we could use len(ngram_indices) within the loop and set wv.num_ngram_vectors at the end of the loop?
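
A sketch of that suggestion (the reviewer's idea only, not the code that was merged):

ngram_indices = []
for word in wv.vocab.keys():
    for ngram in _compute_ngrams(word, wv.min_n, wv.max_n):
        ngram_hash = _ft_hash(ngram) % self.bucket
        if ngram_hash not in wv.hash2index:
            wv.hash2index[ngram_hash] = len(ngram_indices)
            ngram_indices.append(ngram_hash)
wv.num_ngram_vectors = len(ngram_indices)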

for word in wv.vocab.keys():
for ngram in _compute_ngrams(word, wv.min_n, wv.max_n):
ngram_hash = _ft_hash(ngram) % self.bucket
if ngram_hash not in wv.hash2index:
wv.num_ngram_vectors += 1
ngram_indices.append(ngram_hash)
wv.hash2index[ngram_hash] = new_hash_count
new_hash_count = new_hash_count + 1

logger.info("Total number of ngrams is %d", wv.num_ngram_vectors)

wv.vectors_ngrams = wv.vectors_ngrams.take(ngram_indices, axis=0)
self.vectors_ngrams_lockf = self.vectors_ngrams_lockf.take(ngram_indices, axis=0)
self.reset_ngrams_weights(wv)
else:
new_ngrams = []
for w, ngrams in iteritems(wv.ngrams_word):
new_ngrams += [ng for ng in ngrams if ng not in wv.ngrams]

new_ngrams = list(set(new_ngrams))
wv.num_ngram_vectors += len(new_ngrams)
logger.info("Number of new ngrams is %d", len(new_ngrams))
new_hash_count = 0
for i, ngram in enumerate(new_ngrams):
ngram_hash = _ft_hash(ngram) % self.bucket
if ngram_hash not in wv.hash2index:
wv.hash2index[ngram_hash] = new_hash_count + self.old_hash2index_len
wv.ngrams[ngram] = wv.hash2index[ngram_hash]
new_hash_count = new_hash_count + 1
else:
wv.ngrams[ngram] = wv.hash2index[ngram_hash]
num_new_ngrams = 0
Contributor review comment:
There seems to be some redundancy again with new_hash_count, num_new_ngrams.

for word in wv.vocab.keys():
for ngram in _compute_ngrams(word, wv.min_n, wv.max_n):
ngram_hash = _ft_hash(ngram) % self.bucket
if ngram_hash not in wv.hash2index:
wv.hash2index[ngram_hash] = new_hash_count + self.old_hash2index_len
new_hash_count = new_hash_count + 1
num_new_ngrams += 1

wv.num_ngram_vectors += num_new_ngrams
logger.info("Number of new ngrams is %d", num_new_ngrams)

rand_obj = np.random
rand_obj.seed(self.seed)
@@ -833,10 +845,10 @@ def get_vocab_word_vecs(self, wv):
"""Calculate vectors for words in vocabulary and stores them in `vectors`."""
for w, v in wv.vocab.items():
word_vec = np.copy(wv.vectors_vocab[v.index])
ngrams = wv.ngrams_word[w]
ngrams = _compute_ngrams(w, wv.min_n, wv.max_n)
ngram_weights = wv.vectors_ngrams
for ngram in ngrams:
word_vec += ngram_weights[wv.ngrams[ngram]]
word_vec += ngram_weights[wv.hash2index[_ft_hash(ngram) % self.bucket]]
word_vec /= (len(ngrams) + 1)
wv.vectors[v.index] = word_vec

@@ -847,20 +859,21 @@ def init_ngrams_post_load(self, file_name, wv):
vectors are discarded here to save space.

"""
all_ngrams = []
wv.vectors = np.zeros((len(wv.vocab), wv.vector_size), dtype=REAL)

for w, vocab in wv.vocab.items():
all_ngrams += _compute_ngrams(w, wv.min_n, wv.max_n)
wv.vectors[vocab.index] += np.array(wv.vectors_ngrams[vocab.index])

all_ngrams = set(all_ngrams)
wv.num_ngram_vectors = len(all_ngrams)
ngram_indices = []
for i, ngram in enumerate(all_ngrams):
ngram_hash = _ft_hash(ngram)
ngram_indices.append(len(wv.vocab) + ngram_hash % self.bucket)
wv.ngrams[ngram] = i
wv.num_ngram_vectors = 0
for word in wv.vocab.keys():
for ngram in _compute_ngrams(word, wv.min_n, wv.max_n):
ngram_hash = _ft_hash(ngram) % self.bucket
if ngram_hash in wv.hash2index:
continue
ngram_indices.append(len(wv.vocab) + ngram_hash)
wv.hash2index[ngram_hash] = wv.num_ngram_vectors
wv.num_ngram_vectors += 1
Contributor review comment:
Can be set to len(ngram_indices) at the end instead (sorry for nitpicking, but we already have very long code for some of these methods)

wv.vectors_ngrams = wv.vectors_ngrams.take(ngram_indices, axis=0)

ngram_weights = wv.vectors_ngrams
@@ -873,7 +886,8 @@ def init_ngrams_post_load(self, file_name, wv):
for w, vocab in wv.vocab.items():
word_ngrams = _compute_ngrams(w, wv.min_n, wv.max_n)
for word_ngram in word_ngrams:
wv.vectors[vocab.index] += np.array(ngram_weights[wv.ngrams[word_ngram]])
vec_idx = wv.hash2index[_ft_hash(word_ngram) % self.bucket]
wv.vectors[vocab.index] += np.array(ngram_weights[vec_idx])

wv.vectors[vocab.index] /= (len(word_ngrams) + 1)
logger.info(
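
With these changes, estimate_memory counts only the distinct ngram buckets actually hit by the vocabulary, so the syn0_ngrams figure lines up with the vectors_ngrams matrix that init_ngrams_weights ends up allocating. A minimal usage sketch against the gensim 3.x API of the time (corpus and sizes are illustrative, not taken from this PR):

from gensim.models import FastText
from gensim.test.utils import common_texts

model = FastText(size=10, min_count=1, bucket=10000)
model.build_vocab(common_texts)

# Logs "estimated required memory for N words, M buckets and 10 dimensions: B bytes".
report = model.estimate_memory()
print(report['syn0_ngrams'], report['total'])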