From 18f53024707c62e0ecd63a823fb4021e993669b1 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Thu, 24 Jan 2019 23:13:36 +1100 Subject: [PATCH 01/39] WIP: doco improvements doctest complains about vocabulary and training continuation --- gensim/models/_fasttext_bin.py | 3 ++ gensim/models/fasttext.py | 82 ++++++++++++++++++++++++++++++++-- 2 files changed, 82 insertions(+), 3 deletions(-) diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py index 31d85c5074..61c8a63f93 100644 --- a/gensim/models/_fasttext_bin.py +++ b/gensim/models/_fasttext_bin.py @@ -1,6 +1,9 @@ # -*- coding: utf-8 -*- """Load models from the native binary format released by Facebook. +The main entry point is the :py:func:`load` function. +It returns a :py:class:`Model` namedtuple containing everything loaded from the binary. + Examples -------- diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 1b03268442..e534051af8 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -14,6 +14,9 @@ This module contains a fast native C implementation of Fasttext with Python interfaces. It is **not** only a wrapper around Facebook's implementation. +This module supports loading models trained with Facebook's fastText implementation. +It also supports continuing training from such models. + For a tutorial see `this notebook `_. @@ -31,6 +34,15 @@ >>> from gensim.models import FastText >>> >>> model = FastText(common_texts, size=4, window=3, min_count=1, iter=10) + >>> sentences = [ + ... ['computer', 'artificial', 'intelligence'], + ... ['artificial', 'trees'], + ... ['human', 'intelligence'], + ... ['artificial', 'graph'], + ... ['intelligence'], + ... ['artificial', 'intelligence', 'system'] + ... ] + >>> model.train(sentences, total_examples=len(sentences), epochs=model.epochs) Persist a model to disk with: @@ -41,7 +53,49 @@ >>> fname = get_tmpfile("fasttext.model") >>> >>> model.save(fname) - >>> model = FastText.load(fname) # you can continue training with the loaded model! + >>> model = FastText.load(fname) + +Once loaded, such models behave identically to those created from scratch. +For example, you can continue training the loaded model: + + >>> new_sentences = [ + ... ['sweet', 'child', 'of', 'mine'], + ... ['rocket', 'queen'], + ... ['you', 'could', 'be', 'mine'], + ... ['november', 'rain'], + ... ] + >>> 'rocket' in model.wv + False + >>> model.train(new_sentences, total_examples=len(sentences), epochs=model.epochs) + >>> 'rocket' in model.wv + True + +You can also load models trained with Facebook's fastText implementation: + +.. sourcecode:: pycon + + >>> from gensim.test.utils import datapath + >>> cap_path = datapath("crime-and-punishment.bin") + >>> # Partial model: loads quickly, uses less RAM, but cannot continue training + >>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False) + >>> # Full model: loads slowly, consumes RAM, but can continue training (see below) + >>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True) + +Once loaded, such models behave identically to those trained from scratch. +You may continue training them on new data: + +.. 
sourcecode:: pycon + + >>> 'computer' in fb_full.wv.vocab # New word, currently out of vocab + False + >>> 'rocket' in fb_full.wv.vocab + False + >>> fb_full.train(sentences, total_examples=len(sentences), epochs=model.epochs) + >>> fb_full.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs) + >>> 'computer' in fb_full.wv.vocab # We have learned this word now + True + >>> 'rocket' in fb_full.wv.vocab + True Retrieve word-vector for vocab and out-of-vocab word: @@ -85,6 +139,27 @@ >>> analogies_result = model.wv.evaluate_word_analogies(datapath('questions-words.txt')) +Implementation Notes +-------------------- + +Our FastText implementation is split across several submodules: + +- :py:mod:`gensim.models.fasttext`: This module. Contains FastText-specific functionality only. +- :py:mod:`gensim.models.keyedvectors`: Implements both generic and FastText-specific functionality. +- :py:mod:`gensim.models.word2vec`: +- :py:mod:`gensim.models.base_any2vec`: +- :py:mod:`gensim.models.utils_any2vec`: Wrapper over Cython extensions. + +Our implementation relies heavily on inheritance. +It consists of several important classes: + +- :py:class:`FastTextVocab`: the vocabulary. +- :py:class:`gensim.models.keyedvectors.FastTextKeyedVectors`: the vectors. + Once training is complete, this class is sufficient for calculating embeddings. +- :py:class:`FastTextTrainables`: the underlying neural network. The implementation + uses this class to *learn* the word embeddings. +- :py:class:`FastText`: ties everything together. + """ import logging @@ -759,7 +834,8 @@ def load_fasttext_format(cls, model_file, encoding='utf8'): Notes ------ - Due to limitations in the FastText API, you cannot continue training with a model loaded this way. + This function effectively ignores `.vec` output file. + It only needs the `.bin` file. Parameters ---------- @@ -773,7 +849,7 @@ def load_fasttext_format(cls, model_file, encoding='utf8'): Returns ------- - :class: `~gensim.models.fasttext.FastText` + gensim.models.fasttext.FastText The loaded model. """ From b57a08628f765315b4552dff4f68941030c747d3 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 25 Jan 2019 23:13:14 +1100 Subject: [PATCH 02/39] more doco --- gensim/models/fasttext.py | 42 ++++++++++++++++++++++++++++++----- gensim/models/keyedvectors.py | 10 +++++++-- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index e534051af8..f6440e665e 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -142,6 +142,7 @@ Implementation Notes -------------------- +These notes may help developers navigate our fastText implementation. Our FastText implementation is split across several submodules: - :py:mod:`gensim.models.fasttext`: This module. Contains FastText-specific functionality only. @@ -153,8 +154,8 @@ Our implementation relies heavily on inheritance. It consists of several important classes: -- :py:class:`FastTextVocab`: the vocabulary. -- :py:class:`gensim.models.keyedvectors.FastTextKeyedVectors`: the vectors. +- :py:class:`FastTextVocab`: the vocabulary. Redundant, simply wraps its superclass. +- :py:class:`~gensim.models.keyedvectors.FastTextKeyedVectors`: the vectors. Once training is complete, this class is sufficient for calculating embeddings. - :py:class:`FastTextTrainables`: the underlying neural network. The implementation uses this class to *learn* the word embeddings. 
@@ -938,15 +939,44 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_inse return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive) -# -# Keep for backward compatibility. -# class FastTextVocab(Word2VecVocab): + """This is a redundant class. It exists only to maintain backwards compatibility + with older gensim versions.""" pass class FastTextTrainables(Word2VecTrainables): - """Represents the inner shallow neural network used to train :class:`~gensim.models.fasttext.FastText`.""" + """Represents the inner shallow neural network used to train :class:`~gensim.models.fasttext.FastText`. + + Mostly inherits from its parent (:py:class:`gensim.models.word2vec.Word2VecTrainables`). + Adds logic for calculating and maintaining ngram weights. + + Attributes + ---------- + + hashfxn : function + Used for randomly initializing weights. Defaults to the built-in hash() + layer1_size : int + The size of the inner layer of the NN. Equal to the vector dimensionality. Set in the :py:class:`gensim.models.word2vec.Word2VecTrainables` constructor. + seed : float + The random generator seed used in reset_weights and update_weights + syn1 : numpy.array + The inner layer of the NN. Each row corresponds to a term in the vocabulary. Columns correspond to weights of the inner layer. There are layer1_size such weights. Set in the reset_weights and update_weights methods, only if hierarchical sampling is used. + syn1neg : numpy.array + Similar to syn1, but only set if negative sampling is used. + vectors_lockf : numpy.array + A one-dimensional array with one element for each term in the vocab. Set in reset_weights to an array of ones. + vectors_vocab_lockf : numpy.array + Similar to vectors_vocab_lockf, ones(len(model.trainables.vectors), dtype=REAL) + vectors_ngrams_lockf : numpy.array + np.ones((self.bucket, wv.vector_size), dtype=REAL) + + Notes + ----- + + The lockf stuff looks like it gets used by the fast C implementation. + + """ def __init__(self, vector_size=100, seed=1, hashfxn=hash, bucket=2000000): super(FastTextTrainables, self).__init__( vector_size=vector_size, seed=seed, hashfxn=hashfxn) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index d9dad1cc56..935a53967a 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -1936,14 +1936,19 @@ class FastTextKeyedVectors(WordEmbeddingsKeyedVectors): If True, uses the Facebook-compatible hash function instead of the Gensim backwards-compatible hash function. + Some important attributes: + Attributes ---------- vectors_vocab : np.array - A vector for each entity in the vocabulary. + Each row corresponds to a vector for an entity in the vocabulary. + Columns correspond to vector dimensions. vectors_vocab_norm : np.array Same as vectors_vocab, but the vectors are L2 normalized. vectors_ngrams : np.array A vector for each ngram across all entities in the vocabulary. + Each row is a vector that corresponds to a bucket. + Columns correspond to vector dimensions. vectors_ngrams_norm : np.array Same as vectors_ngrams, but the vectors are L2 normalized. Under some conditions, may actually be the same matrix as @@ -1957,7 +1962,8 @@ class FastTextKeyedVectors(WordEmbeddingsKeyedVectors): bucket to an index, and then indexing into vectors_ngrams (in other words, vectors_ngrams[hash2index[hash_fn(ngram) % bucket]]. num_ngram_vectors : int - TODO + The number of vectors that correspond to ngrams, as opposed to terms + (full words). 
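+
+    Example
+    -------
+    A minimal sketch of the bucket-to-row lookup described under hash2index
+    above. Here ``m`` stands for a trained FastText model and ``hash_fn`` for
+    whichever hash function the model was trained with; both names are
+    illustrative stand-ins, not attributes of this class:
+
+    .. sourcecode:: pycon
+
+        >>> # wv = m.wv  # a FastTextKeyedVectors instance
+        >>> # row = wv.hash2index[hash_fn(ngram) % wv.bucket]
+        >>> # ngram_vector = wv.vectors_ngrams[row]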
""" def __init__(self, vector_size, min_n, max_n, bucket, compatible_hash): From d66f55c229a03dcb27ada81124496d8f302cc480 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 25 Jan 2019 23:46:32 +1100 Subject: [PATCH 03/39] flake8-docs updates --- gensim/models/fasttext.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 4abd735bcf..f937ef72d3 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -58,17 +58,19 @@ Once loaded, such models behave identically to those created from scratch. For example, you can continue training the loaded model: +.. sourcecode:: pycon + + >>> import numpy as np + >>> old_computer = model.wv['computer'] # Grab the existing vector for this word >>> new_sentences = [ - ... ['sweet', 'child', 'of', 'mine'], - ... ['rocket', 'queen'], - ... ['you', 'could', 'be', 'mine'], - ... ['november', 'rain'], + ... ['computers', 'expensive'], + ... ['computer', 'chess', 'players', 'stronger', 'than', 'humans'], + ... ['computers', 'are', 'everywhere'], ... ] - >>> 'rocket' in model.wv - False >>> model.train(new_sentences, total_examples=len(sentences), epochs=model.epochs) - >>> 'rocket' in model.wv - True + >>> new_computer = model.wv['computer'] + >>> np.allclose(old_computer, new_computer) + False You can also load models trained with Facebook's fastText implementation: @@ -86,16 +88,15 @@ .. sourcecode:: pycon + >>> import numpy as np >>> 'computer' in fb_full.wv.vocab # New word, currently out of vocab False - >>> 'rocket' in fb_full.wv.vocab - False + >>> old_computer = fb_full.wv['computer'] # Calculate current vectors >>> fb_full.train(sentences, total_examples=len(sentences), epochs=model.epochs) >>> fb_full.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs) - >>> 'computer' in fb_full.wv.vocab # We have learned this word now - True - >>> 'rocket' in fb_full.wv.vocab - True + >>> new_computer = fb_full.wv['computer'] + >>> np.allclose(old_computer, new_computer) # Vector has changed, model has learnt something + False Retrieve word-vector for vocab and out-of-vocab word: From 96966577bf40725676507adf6848e0cee8d23615 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 26 Jan 2019 00:19:42 +1100 Subject: [PATCH 04/39] adding fixmes --- gensim/models/fasttext.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index f937ef72d3..92706fc7fc 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -61,7 +61,7 @@ .. sourcecode:: pycon >>> import numpy as np - >>> old_computer = model.wv['computer'] # Grab the existing vector for this word + >>> old_computer = np.copy(model.wv['computer']) # Grab the existing vector for this word >>> new_sentences = [ ... ['computers', 'expensive'], ... ['computer', 'chess', 'players', 'stronger', 'than', 'humans'], @@ -69,7 +69,8 @@ ... ] >>> model.train(new_sentences, total_examples=len(sentences), epochs=model.epochs) >>> new_computer = model.wv['computer'] - >>> np.allclose(old_computer, new_computer) + >>> # FIXME: why is this True?? + >>> np.allclose(old_computer, new_computer, atol=1e-4) False You can also load models trained with Facebook's fastText implementation: @@ -88,14 +89,14 @@ .. 
sourcecode:: pycon - >>> import numpy as np >>> 'computer' in fb_full.wv.vocab # New word, currently out of vocab False - >>> old_computer = fb_full.wv['computer'] # Calculate current vectors + >>> old_computer = np.copy(fb_full.wv['computer']) # Calculate current vectors >>> fb_full.train(sentences, total_examples=len(sentences), epochs=model.epochs) >>> fb_full.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs) >>> new_computer = fb_full.wv['computer'] - >>> np.allclose(old_computer, new_computer) # Vector has changed, model has learnt something + >>> # FIXME: why is this True?? + >>> np.allclose(old_computer, new_computer, atol=1e-4) # Vector has changed, model has learnt something False Retrieve word-vector for vocab and out-of-vocab word: From 3019bea353b2200de57f02d4fefbe3f023312514 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 26 Jan 2019 13:20:11 +1100 Subject: [PATCH 05/39] minor fixup --- gensim/models/fasttext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 92706fc7fc..ed950ddc7c 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -67,7 +67,7 @@ ... ['computer', 'chess', 'players', 'stronger', 'than', 'humans'], ... ['computers', 'are', 'everywhere'], ... ] - >>> model.train(new_sentences, total_examples=len(sentences), epochs=model.epochs) + >>> model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs) >>> new_computer = model.wv['computer'] >>> # FIXME: why is this True?? >>> np.allclose(old_computer, new_computer, atol=1e-4) From 74b740c8b19fb24fad4f713664e993fd6bce27e8 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 26 Jan 2019 13:53:40 +1100 Subject: [PATCH 06/39] review response --- gensim/models/fasttext.py | 46 +++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index ed950ddc7c..918fd6d5b4 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -145,21 +145,26 @@ -------------------- These notes may help developers navigate our fastText implementation. -Our FastText implementation is split across several submodules: +The implementation is split across several submodules: -- :py:mod:`gensim.models.fasttext`: This module. Contains FastText-specific functionality only. +- :py:mod:`gensim.models.fasttext`: This module. Contains FastText-specific functionality only. - :py:mod:`gensim.models.keyedvectors`: Implements both generic and FastText-specific functionality. -- :py:mod:`gensim.models.word2vec`: -- :py:mod:`gensim.models.base_any2vec`: +- :py:mod:`gensim.models.word2vec`: Contains implementations for the vocabulary + and the trainables for FastText. +- :py:mod:`gensim.models.base_any2vec`: Contains implementations for the base + classes, including functionality such as callbacks, logging. - :py:mod:`gensim.models.utils_any2vec`: Wrapper over Cython extensions. +- :py:mod:`gensim.utils`: Implements model I/O (loading and saving) Our implementation relies heavily on inheritance. It consists of several important classes: -- :py:class:`FastTextVocab`: the vocabulary. Redundant, simply wraps its superclass. +- :py:class:`~gensim.models.word2vec.Word2VecVocab`: the vocabulary. + Keeps track of all the unique words, sometimes discarding the extremely rare ones. + This is sometimes called the Dictionary within Gensim. 
- :py:class:`~gensim.models.keyedvectors.FastTextKeyedVectors`: the vectors.
   Once training is complete, this class is sufficient for calculating embeddings.
 - :py:class:`FastTextTrainables`: the underlying neural network. The implementation
   uses this class to *learn* the word embeddings.
 - :py:class:`FastText`: ties everything together.

@@ -613,7 +618,7 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p
     def _set_train_params(self, **kwargs):
         #
         # We need the wv.buckets_word member to be initialized in order to
-        # continue training.  The _clear_post_train method destroys this
+        # continue training. The _clear_post_train method destroys this
         # variable, so we reinitialize it here, if needed.
         #
         # The .old_vocab_len and .old_hash2index_len members are set only to
@@ -849,7 +854,10 @@ def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True):

         Notes
         ------
-        This function effectively ignores `.vec` output file.
+        Facebook provides both `.vec` and `.bin` files with their models.
+        The former contains human-readable vectors.
+        The latter contains machine-readable vectors along with other model parameters.
+        This function effectively ignores the `.vec` output file, since that file is redundant.
         It only needs the `.bin` file.

         Parameters
@@ -862,7 +870,7 @@ def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True):
         encoding : str, optional
             Specifies the file encoding.
         full_model : boolean, optional
-            If False, skips loading the hidden output matrix.  This saves a fair bit
+            If False, skips loading the hidden output matrix. This saves a fair bit
             of CPU time and RAM, but prevents training continuation.

         Returns
@@ -935,7 +943,7 @@ def load(cls, *args, **kwargs):

         if not hasattr(model.wv, 'compatible_hash'):
             logger.warning(
-                "This older model was trained with a buggy hash function.  "
+                "This older model was trained with a buggy hash function. "
                 "The model will continue to work, but consider training it "
                 "from scratch."
             )
@@ -957,7 +965,7 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_inse


 class FastTextVocab(Word2VecVocab):
-    """This is a redundant class.  It exists only to maintain backwards compatibility
+    """This is a redundant class. It exists only to maintain backwards compatibility
     with older gensim versions."""
     pass

@@ -972,17 +980,17 @@ class FastTextTrainables(Word2VecTrainables):
     """Represents the inner shallow neural network used to train :class:`~gensim.models.fasttext.FastText`.

     Mostly inherits from its parent (:py:class:`gensim.models.word2vec.Word2VecTrainables`).
     Adds logic for calculating and maintaining ngram weights.

     Attributes
     ----------

     hashfxn : function
-        Used for randomly initializing weights.  Defaults to the built-in hash()
+        Used for randomly initializing weights. Defaults to the built-in hash()
     layer1_size : int
-        The size of the inner layer of the NN.  Equal to the vector dimensionality.  Set in the :py:class:`gensim.models.word2vec.Word2VecTrainables` constructor.
+        The size of the inner layer of the NN. Equal to the vector dimensionality. Set in the :py:class:`gensim.models.word2vec.Word2VecTrainables` constructor.
     seed : float
         The random generator seed used in reset_weights and update_weights
     syn1 : numpy.array
-        The inner layer of the NN.  Each row corresponds to a term in the vocabulary.  Columns correspond to weights of the inner layer.  There are layer1_size such weights.  Set in the reset_weights and update_weights methods, only if hierarchical sampling is used.
+        The inner layer of the NN. Each row corresponds to a term in the vocabulary. Columns correspond to weights of the inner layer. There are layer1_size such weights. Set in the reset_weights and update_weights methods, only if hierarchical sampling is used.
     syn1neg : numpy.array
         Similar to syn1, but only set if negative sampling is used.
     vectors_lockf : numpy.array
-        A one-dimensional array with one element for each term in the vocab.  Set in reset_weights to an array of ones.
+        A one-dimensional array with one element for each term in the vocab. Set in reset_weights to an array of ones.
     vectors_vocab_lockf : numpy.array
-        Similar to vectors_vocab_lockf, ones(len(model.trainables.vectors), dtype=REAL)
+        Similar to vectors_lockf, ones(len(model.trainables.vectors), dtype=REAL)
     vectors_ngrams_lockf : numpy.array
         np.ones((self.bucket, wv.vector_size), dtype=REAL)
@@ -1007,17 +1015,17 @@ def __init__(self, vector_size=100, seed=1, hashfxn=hash, bucket=2000000):
         # 2. vectors_ngrams_lockf
         #
         # These are both 2D matrices of shapes equal to the shapes of
-        # wv.vectors_vocab and wv.vectors_ngrams.  So, each row corresponds to
+        # wv.vectors_vocab and wv.vectors_ngrams. So, each row corresponds to
         # a vector, and each column corresponds to a dimension within that
         # vector.
         #
-        # Lockf stands for "lock factor": zero values suppress learning, one
+        # Lockf stands for "lock factor": zero values suppress learning, one
         # values enable it. Interestingly, the vectors_vocab_lockf and
         # vectors_ngrams_lockf seem to be used only by the C code in
         # fasttext_inner.pyx.
         #
         # The word2vec implementation also uses vectors_lockf: in that case,
-        # it's a 1D array, with a real number for each vector.  The FastText
+        # it's a 1D array, with a real number for each vector. The FastText
         # implementation inherits this vectors_lockf attribute but doesn't
         # appear to use it.
         #
@@ -1095,7 +1103,7 @@ def _load_fasttext_format(model_file, encoding='utf-8', full_model=True):
     encoding : str, optional
         Specifies the file encoding.
     full_model : boolean, optional
-        If False, skips loading the hidden output matrix.  This saves a fair bit
+        If False, skips loading the hidden output matrix. This saves a fair bit
         of CPU time and RAM, but prevents training continuation.

     Returns

From 09ab63011d31d92162c9ac2dc2189ba5d60bf436 Mon Sep 17 00:00:00 2001
From: Michael Penkov
Date: Sat, 26 Jan 2019 13:54:08 +1100
Subject: [PATCH 07/39] Remove magic constant

---
 gensim/models/fasttext.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 918fd6d5b4..c062e599a5 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -318,9 +318,6 @@ def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _exp
     raise RuntimeError("Training with corpus_file argument is not supported")


-FASTTEXT_FILEFORMAT_MAGIC = 793712314
-
-
 class FastText(BaseWordEmbeddingsModel):
     """Train, use and evaluate word representations learned using the method
     described in `Enriching Word Vectors with Subword Information `_, aka FastText.

From 2e728cb4fd70519ffbe05c910d0de240d49e3e07 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 26 Jan 2019 17:13:23 +1100 Subject: [PATCH 08/39] deprecate the iter parameter to the FastText constructor --- gensim/models/fasttext.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index c062e599a5..095ba4fdd7 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -353,7 +353,7 @@ class FastText(BaseWordEmbeddingsModel): """ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, + negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=None, null_word=0, min_n=3, max_n=6, sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(), compatible_hash=True): """ @@ -416,7 +416,7 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha hashfxn : function, optional Hash function to use to randomly initialize weights, for increased training reproducibility. iter : int, optional - Number of iterations (epochs) over the corpus. + Deprecated. trim_rule : function, optional Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). @@ -471,6 +471,12 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha >>> of_vector = model.wv['of'] # get vector for out-of-vocab word """ + if iter is not None: + logging.warn( + 'The iter parameter is deprecated. Pass the epochs keyword ' + 'parameter to the train method instead.' + ) + self.load = call_on_class_only self.load_fasttext_format = call_on_class_only self.callbacks = callbacks @@ -487,7 +493,7 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha self.wv.bucket = self.trainables.bucket super(FastText, self).__init__( - sentences=sentences, corpus_file=corpus_file, workers=workers, vector_size=size, epochs=iter, + sentences=sentences, corpus_file=corpus_file, workers=workers, vector_size=size, callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, fast_version=FAST_VERSION) From c435a8e62837e34c24ae0fa800be0c41aea173d9 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 26 Jan 2019 17:13:37 +1100 Subject: [PATCH 09/39] minor documentation fixes --- gensim/models/fasttext.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 095ba4fdd7..574feb2ecc 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -985,11 +985,15 @@ class FastTextTrainables(Word2VecTrainables): hashfxn : function Used for randomly initializing weights. Defaults to the built-in hash() layer1_size : int - The size of the inner layer of the NN. Equal to the vector dimensionality. Set in the :py:class:`gensim.models.word2vec.Word2VecTrainables` constructor. + The size of the inner layer of the NN. Equal to the vector dimensionality. + Set in the :py:class:`gensim.models.word2vec.Word2VecTrainables` constructor. 
seed : float
-        The random generator seed used in reset_weights and update_weights
+        The random generator seed used in reset_weights and update_weights.
     syn1 : numpy.array
-        The inner layer of the NN. Each row corresponds to a term in the vocabulary. Columns correspond to weights of the inner layer. There are layer1_size such weights. Set in the reset_weights and update_weights methods, only if hierarchical sampling is used.
+        The inner layer of the NN. Each row corresponds to a term in the vocabulary.
+        Columns correspond to weights of the inner layer.
+        There are layer1_size such weights.
+        Set in the reset_weights and update_weights methods, only if hierarchical sampling is used.
     syn1neg : numpy.array
         Similar to syn1, but only set if negative sampling is used.
     vectors_lockf : numpy.array
         A one-dimensional array with one element for each term in the vocab. Set in reset_weights to an array of ones.
     vectors_vocab_lockf : numpy.array
         Similar to vectors_lockf, ones(len(model.trainables.vectors), dtype=REAL)
     vectors_ngrams_lockf : numpy.array
         np.ones((self.bucket, wv.vector_size), dtype=REAL)

-    Notes
-    -----
-
-    The lockf stuff looks like it gets used by the fast C implementation.
-
     """

From c688877da830090b32eb5e9f0019372f9218b13e Mon Sep 17 00:00:00 2001
From: Michael Penkov
Date: Sat, 26 Jan 2019 17:19:18 +1100
Subject: [PATCH 10/39] review response: use absolute references

---
 gensim/models/fasttext.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 574feb2ecc..135478a449 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -147,26 +147,26 @@
 These notes may help developers navigate our fastText implementation.
 The implementation is split across several submodules:

-- :py:mod:`gensim.models.fasttext`: This module. Contains FastText-specific functionality only.
-- :py:mod:`gensim.models.keyedvectors`: Implements both generic and FastText-specific functionality.
-- :py:mod:`gensim.models.word2vec`: Contains implementations for the vocabulary
+- :mod:`gensim.models.fasttext`: This module. Contains FastText-specific functionality only.
+- :mod:`gensim.models.keyedvectors`: Implements both generic and FastText-specific functionality.
+- :mod:`gensim.models.word2vec`: Contains implementations for the vocabulary
   and the trainables for FastText.
-- :py:mod:`gensim.models.base_any2vec`: Contains implementations for the base
+- :mod:`gensim.models.base_any2vec`: Contains implementations for the base
   classes, including functionality such as callbacks, logging.
-- :py:mod:`gensim.models.utils_any2vec`: Wrapper over Cython extensions.
-- :py:mod:`gensim.utils`: Implements model I/O (loading and saving)
+- :mod:`gensim.models.utils_any2vec`: Wrapper over Cython extensions.
+- :mod:`gensim.utils`: Implements model I/O (loading and saving).

 Our implementation relies heavily on inheritance.
 It consists of several important classes:

-- :py:class:`~gensim.models.word2vec.Word2VecVocab`: the vocabulary.
+- :class:`~gensim.models.word2vec.Word2VecVocab`: the vocabulary.
   Keeps track of all the unique words, sometimes discarding the extremely rare ones.
   This is sometimes called the Dictionary within Gensim.
-- :py:class:`~gensim.models.keyedvectors.FastTextKeyedVectors`: the vectors.
+- :class:`~gensim.models.keyedvectors.FastTextKeyedVectors`: the vectors.
   Once training is complete, this class is sufficient for calculating embeddings.
-- :py:class:`FastTextTrainables`: the underlying neural network. The implementation
-  uses this class to *learn* the word embeddings.
-- :py:class:`FastText`: ties everything together.
+- :class:`~gensim.models.fasttext.FastTextTrainables`: the underlying neural network.
The implementation - uses this class to *learn* the word embeddings. -- :py:class:`FastText`: ties everything together. +- :class:`~gensim.models.fasttext.FastTextTrainables`: the underlying neural network. + The implementation uses this class to *learn* the word embeddings. +- :class:`~gensim.models.fasttext.FastText`: ties everything together. """ @@ -976,7 +976,7 @@ class FastTextVocab(Word2VecVocab): class FastTextTrainables(Word2VecTrainables): """Represents the inner shallow neural network used to train :class:`~gensim.models.fasttext.FastText`. - Mostly inherits from its parent (:py:class:`gensim.models.word2vec.Word2VecTrainables`). + Mostly inherits from its parent (:class:`~gensim.models.word2vec.Word2VecTrainables`). Adds logic for calculating and maintaining ngram weights. Attributes @@ -986,7 +986,7 @@ class FastTextTrainables(Word2VecTrainables): Used for randomly initializing weights. Defaults to the built-in hash() layer1_size : int The size of the inner layer of the NN. Equal to the vector dimensionality. - Set in the :py:class:`gensim.models.word2vec.Word2VecTrainables` constructor. + Set in the :class:`~gensim.models.word2vec.Word2VecTrainables` constructor. seed : float The random generator seed used in reset_weights and update_weights. syn1 : numpy.array From 677679c309649b0cd889d464a45ea89ad2d2f6ef Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 26 Jan 2019 17:53:29 +1100 Subject: [PATCH 11/39] review response --- gensim/models/_fasttext_bin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py index 103ce9b8d8..713aafebfc 100644 --- a/gensim/models/_fasttext_bin.py +++ b/gensim/models/_fasttext_bin.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- """Load models from the native binary format released by Facebook. -The main entry point is the :py:func:`load` function. -It returns a :py:class:`Model` namedtuple containing everything loaded from the binary. +The main entry point is the :func:`~gensim.models._fasttext_bin.load` function. +It returns a :class:`~gensim.models._fasttext_bin.Model` namedtuple containing everything loaded from the binary. Examples -------- @@ -238,7 +238,7 @@ def load(fin, encoding='utf-8', full_model=True): Returns ------- - Model + :class:`~gensim.models._fasttext_bin.Model` The loaded model. 
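+
+    Examples
+    --------
+    A minimal sketch of typical usage. The test file below is the one used by
+    the examples in gensim.models.fasttext; any fastText `.bin` file works:
+
+    .. sourcecode:: pycon
+
+        >>> # from gensim.test.utils import datapath
+        >>> # with open(datapath('crime-and-punishment.bin'), 'rb') as fin:
+        >>> #     m = load(fin, full_model=True)
+        >>> # m is a Model namedtuple holding the parameters, vocab and vectors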
""" From 29c5210e1b4f8c89a0529b7f46f680aab27fef8d Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 26 Jan 2019 20:03:04 +1100 Subject: [PATCH 12/39] fix unit test --- gensim/test/test_fasttext.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 5437a9b3c8..25b7bf3615 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -695,7 +695,10 @@ def test_online_learning_after_save_fromfile(self): epochs=model_neg.epochs) self.assertEqual(len(model_neg.wv.vocab), 14) - def online_sanity(self, model): + def online_sanity(self, model, epochs=None): + if epochs is None: + epochs = model.epochs + terro, others = [], [] for l in list_corpus: if 'terrorism' in l: @@ -730,9 +733,9 @@ def test_sg_neg_online(self): @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") def test_cbow_hs_online(self): model = FT_gensim( - sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1 + sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, seed=42, workers=1 ) - self.online_sanity(model) + self.online_sanity(model, epochs=1) @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") def test_cbow_neg_online(self): From 044d699179ce7a6fac16615b860cff57830986c5 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 26 Jan 2019 22:11:11 +1100 Subject: [PATCH 13/39] Revert "deprecate the iter parameter to the FastText constructor" This reverts commit 2e728cb4fd70519ffbe05c910d0de240d49e3e07. --- gensim/models/fasttext.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 135478a449..b928adcadc 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -353,7 +353,7 @@ class FastText(BaseWordEmbeddingsModel): """ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=None, null_word=0, min_n=3, max_n=6, + negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(), compatible_hash=True): """ @@ -416,7 +416,7 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha hashfxn : function, optional Hash function to use to randomly initialize weights, for increased training reproducibility. iter : int, optional - Deprecated. + Number of iterations (epochs) over the corpus. trim_rule : function, optional Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). @@ -471,12 +471,6 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha >>> of_vector = model.wv['of'] # get vector for out-of-vocab word """ - if iter is not None: - logging.warn( - 'The iter parameter is deprecated. Pass the epochs keyword ' - 'parameter to the train method instead.' 
- ) - self.load = call_on_class_only self.load_fasttext_format = call_on_class_only self.callbacks = callbacks @@ -493,7 +487,7 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha self.wv.bucket = self.trainables.bucket super(FastText, self).__init__( - sentences=sentences, corpus_file=corpus_file, workers=workers, vector_size=size, + sentences=sentences, corpus_file=corpus_file, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, fast_version=FAST_VERSION) From f9df1366481bae4cd078958d590dd69a2ace9ed6 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 26 Jan 2019 22:11:16 +1100 Subject: [PATCH 14/39] Revert "fix unit test" This reverts commit 29c5210e1b4f8c89a0529b7f46f680aab27fef8d. --- gensim/test/test_fasttext.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 25b7bf3615..5437a9b3c8 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -695,10 +695,7 @@ def test_online_learning_after_save_fromfile(self): epochs=model_neg.epochs) self.assertEqual(len(model_neg.wv.vocab), 14) - def online_sanity(self, model, epochs=None): - if epochs is None: - epochs = model.epochs - + def online_sanity(self, model): terro, others = [], [] for l in list_corpus: if 'terrorism' in l: @@ -733,9 +730,9 @@ def test_sg_neg_online(self): @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") def test_cbow_hs_online(self): model = FT_gensim( - sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, seed=42, workers=1 + sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1 ) - self.online_sanity(model, epochs=1) + self.online_sanity(model) @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") def test_cbow_neg_online(self): From cdc727a2e1b3f71b245b503f4e70e082d43701a7 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 27 Jan 2019 12:58:46 +1100 Subject: [PATCH 15/39] more documentation improvements --- gensim/models/fasttext.py | 76 ++++++++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 17 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index b928adcadc..2246ab17d0 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -30,19 +30,60 @@ .. sourcecode:: pycon - >>> from gensim.test.utils import common_texts >>> from gensim.models import FastText - >>> - >>> model = FastText(common_texts, size=4, window=3, min_count=1, iter=10) - >>> sentences = [ - ... ['computer', 'artificial', 'intelligence'], - ... ['artificial', 'trees'], - ... ['human', 'intelligence'], - ... ['artificial', 'graph'], - ... ['intelligence'], - ... ['artificial', 'intelligence', 'system'] - ... 
]
-    >>> model.train(sentences, total_examples=len(sentences), epochs=model.epochs)
+    >>> from gensim.test.utils import common_texts  # some example sentences
+    >>> print(common_texts[0])
+    ['human', 'interface', 'computer']
+    >>> print(len(common_texts))
+    9
+    >>> model = FastText(size=4, window=3, min_count=1)  # instantiate
+    >>> model.build_vocab(sentences=common_texts)
+    >>> model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)  # train
+
+You can also pass all the above parameters to the constructor to do everything
+in a single line:
+
+.. sourcecode:: pycon
+
+    >>> model2 = FastText(
+    ...     size=4, window=3, min_count=1,
+    ...     sentences=common_texts, iter=10
+    ... )
+
+.. Important::
+    We intend to deprecate this second method of passing everything through the constructor.
+    The motivation is to simplify the API and resolve naming inconsistencies,
+    e.g. the iter parameter to the constructor is called epochs in the train function.
+
+The two models above are instantiated differently, but behave identically.
+For example, we can compare the embeddings they've calculated for the word "computer":
+
+    >>> import numpy as np
+    >>> np.allclose(model.wv['computer'], model2.wv['computer'])
+    True
+
+In the above examples, we trained the model from sentences (lists of words) loaded into memory.
+This is OK for smaller datasets, but for larger datasets, we recommend streaming the file,
+for example from disk or the network.
+In Gensim, we refer to such datasets as "corpora" (singular "corpus"), and keep them
+in the format described in :class:`~gensim.models.word2vec.LineSentence`.
+Passing a corpus is simple:
+
+.. sourcode:: pycon
+
+    >>> from gensim.test.utils import datapath
+    >>> corpus_file = datapath('lee_background.cor')  # absolute path to corpus
+    >>> model3 = FastText(size=4, window=3, min_count=1)
+    >>> model3.build_vocab(corpus_file=corpus_file)  # scan over corpus to build the vocabulary
+    >>> total_examples = model3.corpus_count  # number of sentences in the corpus
+    >>> total_words = model3.corpus_total_words  # number of words in the corpus
+    >>> model3.train(corpus_file=corpus_file, total_examples=total_examples, total_words=total_words, epochs=5)
+
+The model needs the `total_examples` and `total_words` parameters in order to
+manage the training rate (alpha) correctly, and to give accurate progress estimates.
+The above example relies on an implementation detail: the build_vocab method
+sets the `corpus_count` and `corpus_total_words` model attributes.
+You may calculate them by scanning over the corpus yourself, too.

 Persist a model to disk with:

     >>> import numpy as np
     >>> old_computer = np.copy(model.wv['computer'])  # Grab the existing vector for this word
     >>> new_sentences = [
-    ...     ['computers', 'expensive'],
-    ...     ['computer', 'chess', 'players', 'stronger', 'than', 'humans'],
-    ...     ['computers', 'are', 'everywhere'],
+    ...     ['computer', 'artificial', 'intelligence'],
+    ...     ['artificial', 'trees'],
+    ...     ['human', 'intelligence'],
+    ...     ['artificial', 'graph'],
+    ...     ['intelligence'],
+    ...     ['artificial', 'intelligence', 'system']
     ... ]
     >>> model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)
     >>> new_computer = model.wv['computer']
     >>> # FIXME: why is this True??
     >>> np.allclose(old_computer, new_computer, atol=1e-4)
     False

 You can also load models trained with Facebook's fastText implementation:

..
sourcecode:: pycon - >>> from gensim.test.utils import datapath >>> cap_path = datapath("crime-and-punishment.bin") >>> # Partial model: loads quickly, uses less RAM, but cannot continue training >>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False) @@ -92,7 +135,6 @@ >>> 'computer' in fb_full.wv.vocab # New word, currently out of vocab False >>> old_computer = np.copy(fb_full.wv['computer']) # Calculate current vectors - >>> fb_full.train(sentences, total_examples=len(sentences), epochs=model.epochs) >>> fb_full.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs) >>> new_computer = fb_full.wv['computer'] >>> # FIXME: why is this True?? From 4ea3f069cca1931ce720bfbd3166b9b8fd6076d1 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 27 Jan 2019 13:12:08 +1100 Subject: [PATCH 16/39] comment out pesky import --- gensim/models/fasttext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 2246ab17d0..ac46873314 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -30,7 +30,7 @@ .. sourcecode:: pycon - >>> from gensim.models import FastText + >>> # from gensim.models import FastText # FIXME: why does Sphinx dislike this import? >>> from gensim.test.utils import common_texts # some example sentences >>> print(common_texts[0]) ['human', 'interface', 'computer'] From e532d62f356dfc56c6cd9383a00129533622eade Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 27 Jan 2019 13:19:53 +1100 Subject: [PATCH 17/39] fix typo --- gensim/models/fasttext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index ac46873314..dba784e3a3 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -69,7 +69,7 @@ in the format described in :class:`~gensim.models.word2vec.LineSentence`. Passing a corpus is simple: -.. sourcode:: pycon +.. 
sourcecode:: pycon >>> from gensim.test.utils import datapath >>> corpus_file = datapath('lee_background.cor') # absolute path to corpus From 931d3d7097606c1ea686e8072f403f2f792d731d Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 27 Jan 2019 14:29:16 +1100 Subject: [PATCH 18/39] improve tutorial notebook --- docs/notebooks/FastText_Tutorial.ipynb | 638 ++++++++++++------------- 1 file changed, 299 insertions(+), 339 deletions(-) diff --git a/docs/notebooks/FastText_Tutorial.ipynb b/docs/notebooks/FastText_Tutorial.ipynb index bc964b2829..ed2d4d522f 100644 --- a/docs/notebooks/FastText_Tutorial.ipynb +++ b/docs/notebooks/FastText_Tutorial.ipynb @@ -54,39 +54,31 @@ "execution_count": 1, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using TensorFlow backend.\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "FastText(vocab=1763, size=100, alpha=0.025)\n" + "FastText(vocab=1762, size=100, alpha=0.025)\n" ] } ], "source": [ - "import gensim\n", - "import os\n", - "from gensim.models.word2vec import LineSentence\n", "from gensim.models.fasttext import FastText as FT_gensim\n", + "from gensim.test.utils import datapath\n", "\n", "# Set file names for train and test data\n", - "data_dir = '{}'.format(os.sep).join([gensim.__path__[0], 'test', 'test_data']) + os.sep\n", - "lee_train_file = data_dir + 'lee_background.cor'\n", - "lee_data = LineSentence(lee_train_file)\n", + "corpus_file = datapath('lee_background.cor')\n", "\n", "model_gensim = FT_gensim(size=100)\n", "\n", "# build the vocabulary\n", - "model_gensim.build_vocab(lee_data)\n", + "model_gensim.build_vocab(corpus_file=corpus_file)\n", "\n", "# train the model\n", - "model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter)\n", + "model_gensim.train(\n", + " corpus_file=corpus_file, epochs=model_gensim.epochs,\n", + " total_examples=model_gensim.corpus_count, total_words=model_gensim.corpus_total_words\n", + ")\n", "\n", "print(model_gensim)" ] @@ -115,10 +107,10 @@ "from gensim.models.wrappers.fasttext import FastText as FT_wrapper\n", "\n", "# Set FastText home to the path to the FastText executable\n", - "ft_home = '/home/chinmaya/GSOC/Gensim/fastText/fasttext'\n", + "ft_home = '/home/misha/src/fastText-0.1.0/fasttext'\n", "\n", "# train the model\n", - "model_wrapper = FT_wrapper.train(ft_home, lee_train_file)\n", + "model_wrapper = FT_wrapper.train(ft_home, corpus_file)\n", "\n", "print(model_wrapper)" ] @@ -160,7 +152,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Note:** As in the case of Word2Vec, you can continue to train your model while using Gensim's native implementation of fastText. However, continuation of training with fastText models while using the wrapper is not supported." + "**Note:** As in the case of Word2Vec, you can continue to train your model while using Gensim's native implementation of fastText." 
] }, { @@ -186,7 +178,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "FastText(vocab=1763, size=100, alpha=0.025)\n", + "FastText(vocab=1762, size=100, alpha=0.025)\n", "FastText(vocab=1763, size=100, alpha=0.025)\n" ] } @@ -231,40 +223,40 @@ "text": [ "True\n", "False\n", - "[ 0.60971916 0.66131264 0.09225323 0.28898761 0.34161603 0.06163925\n", - " -0.10147806 -0.18834428 -0.26355353 0.46417126 0.20428349 0.08414238\n", - " -0.61960417 -0.2977576 -0.22102182 0.14144184 0.13698931 -0.24608244\n", - " -0.58096874 0.3039414 0.18766184 0.38110724 0.11518024 -0.75747257\n", - " -0.275776 -0.42740449 -0.00725944 -0.24556711 0.41061676 0.05050014\n", - " -0.71367824 0.05223881 -0.07810796 0.22933683 0.43850809 0.06360656\n", - " 0.43815458 0.11096461 0.29619065 0.38061273 0.26262566 -0.07368335\n", - " 0.33198604 -0.1431711 -0.04876067 -0.35243919 0.18561274 -0.70321769\n", - " -0.16492438 -0.28362423 0.08294757 0.49758917 -0.17844993 -0.02241638\n", - " 0.18489315 0.01197879 -0.22931916 0.45774016 -0.40240806 -0.16401663\n", - " -0.07500558 0.06775728 0.14273891 0.39902335 0.1906638 0.14533612\n", - " -0.70275193 -0.64343351 -0.18003808 0.45082757 -0.42847934 0.23554228\n", - " 0.03722449 -0.0726353 -0.20106563 -0.85182953 0.16529776 0.2167791\n", - " 0.01655668 -0.45087481 0.44368106 0.94318634 0.3191022 -0.78148538\n", - " 0.06931634 -0.02454508 -0.07709292 0.00889531 0.41768485 -0.4333123\n", - " 0.57354093 0.40387386 0.50435936 0.15307237 0.41140166 0.09306428\n", - " -0.6406759 -0.00130932 0.01818158 0.05408832]\n", - "[ 0.57120456 0.61710706 0.08425266 0.28013577 0.30789921 0.08454974\n", - " -0.05984595 -0.14644302 -0.23369177 0.42689164 0.18699257 0.09090185\n", - " -0.57885733 -0.28756606 -0.20198511 0.12675938 0.14102744 -0.22880791\n", - " -0.52516965 0.27686313 0.19865591 0.33872125 0.11230565 -0.74198454\n", - " -0.28486362 -0.40490177 -0.00606945 -0.18761727 0.40040097 0.06941447\n", - " -0.70890718 0.03646363 -0.0598574 0.19175974 0.4242314 0.05878129\n", - " 0.41432344 0.10394377 0.2668701 0.38148809 0.2761937 -0.06951485\n", - " 0.34113405 -0.12189032 -0.05861677 -0.33032765 0.16585448 -0.65862278\n", - " -0.18381383 -0.28438907 0.08867586 0.46635329 -0.18801565 -0.01610042\n", - " 0.1940661 0.03761584 -0.21442287 0.41826423 -0.38097134 -0.15111094\n", - " -0.08636253 0.07374192 0.12731727 0.40068088 0.18576843 0.13244282\n", - " -0.64814759 -0.62510144 -0.17045424 0.44949777 -0.39068545 0.19102012\n", - " 0.03177847 -0.06673145 -0.17997442 -0.81052922 0.15459165 0.21476634\n", - " -0.01961387 -0.43806009 0.40781115 0.88663652 0.29360816 -0.74157697\n", - " 0.04686275 -0.0396045 -0.06810026 0.00260469 0.40505417 -0.39977569\n", - " 0.5443192 0.38472273 0.48665705 0.12033045 0.40395209 0.10123577\n", - " -0.6243847 -0.02460667 0.00828873 0.04089492]\n" + "[ 0.8314139 0.61584824 -0.22241311 0.07523467 0.5152522 0.07724247\n", + " -0.13744526 0.05606242 -0.09502476 0.45655364 0.51096547 -0.13521144\n", + " -0.7620124 -0.4685431 -0.15228595 -0.03442579 0.20600994 -0.5080321\n", + " -0.6443741 0.605772 -0.30647403 0.41962707 0.06037483 -0.40195057\n", + " -0.11246474 -0.59829116 -0.32052496 -0.48515126 0.2997839 -0.20067295\n", + " -0.20996568 0.12522118 -0.0364657 0.62870216 0.5781912 -0.00992062\n", + " 0.51955134 -0.10997857 0.16197589 0.27111182 -0.06318171 -0.24831475\n", + " 0.09808698 -0.37751442 -0.13298641 -0.15047912 -0.01828656 -0.6400881\n", + " 0.28488973 -0.14948265 0.18325825 0.6458386 -0.00953633 0.13587084\n", + " -0.1961209 -0.42555386 -0.19528134 
0.52414805 -0.30868796 -0.5202228\n", + " -0.10896837 0.06696089 0.44607309 0.37719652 0.08233636 0.24584875\n", + " -0.80979943 -0.30543917 -0.15849951 0.16166946 -0.36826986 -0.00906481\n", + " -0.14814071 -0.25263855 -0.41303173 -0.48292273 -0.05554645 -0.00310395\n", + " 0.21415223 -0.27768075 0.7148276 1.3367277 0.33960983 -0.47452113\n", + " 0.27783358 0.09962273 0.04856196 -0.23065457 0.19847827 -0.7086235\n", + " 0.2897328 0.08882508 0.47819164 -0.10128012 0.17164136 -0.08161731\n", + " -0.64568347 -0.04466937 0.04507336 0.4807562 ]\n", + "[ 0.7486652 0.5551642 -0.20113334 0.0694495 0.46116358 0.06881845\n", + " -0.12488337 0.05208117 -0.08345503 0.41118833 0.4612766 -0.12186286\n", + " -0.68638855 -0.4214572 -0.13843313 -0.03139759 0.18622552 -0.45825756\n", + " -0.57948387 0.54435897 -0.27771378 0.3789184 0.05383135 -0.36025965\n", + " -0.10304614 -0.53994924 -0.28970715 -0.43614468 0.26968622 -0.18174443\n", + " -0.19075763 0.11169459 -0.03211116 0.5669812 0.5213458 -0.01047292\n", + " 0.4683945 -0.09853561 0.14416309 0.2458799 -0.05680516 -0.22388494\n", + " 0.08682863 -0.34187067 -0.11945734 -0.1357073 -0.0152749 -0.5779147\n", + " 0.25770664 -0.13402262 0.16518788 0.5821273 -0.00866939 0.12256315\n", + " -0.17704405 -0.38423932 -0.1755833 0.47041836 -0.27653104 -0.46991062\n", + " -0.09599836 0.05943088 0.4017819 0.33958077 0.07508487 0.22090466\n", + " -0.72955 -0.2727049 -0.14109111 0.14624386 -0.33014265 -0.00984893\n", + " -0.13071296 -0.22914156 -0.37331858 -0.43644536 -0.05077597 -0.00315402\n", + " 0.19187897 -0.2513682 0.6448789 1.2039913 0.30247915 -0.4269294\n", + " 0.25062108 0.08874664 0.04146989 -0.20783317 0.17835104 -0.6382346\n", + " 0.26064712 0.08040012 0.43090543 -0.09168535 0.15238702 -0.07426675\n", + " -0.5815522 -0.03998712 0.04137334 0.4317176 ]\n" ] } ], @@ -286,25 +278,19 @@ "cell_type": "code", "execution_count": 5, "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'all ngrams for word axe absent from model'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Raises a KeyError since none of the character ngrams of the word `axe` are present in the training data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mmodel_wrapper\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'axe'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/home/chinmaya/GSOC/Gensim/gensim/gensim/models/word2vec.pyc\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, words)\u001b[0m\n\u001b[1;32m 1280\u001b[0m \u001b[0mRefer\u001b[0m \u001b[0mto\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mdocumentation\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0;34m`\u001b[0m\u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mKeyedVectors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getitem__\u001b[0m\u001b[0;34m`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1281\u001b[0m \"\"\"\n\u001b[0;32m-> 1282\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 
1283\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1284\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__contains__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mword\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/home/chinmaya/GSOC/Gensim/gensim/gensim/models/keyedvectors.pyc\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, words)\u001b[0m\n\u001b[1;32m 587\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstring_types\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 588\u001b[0m \u001b[0;31m# allow calls like trained_model['office'], as a shorthand for trained_model[['office']]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 589\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mword_vec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 590\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mvstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mword_vec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mword\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mword\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mwords\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/home/chinmaya/GSOC/Gensim/gensim/gensim/models/wrappers/fasttext.pyc\u001b[0m in \u001b[0;36mword_vec\u001b[0;34m(self, word, use_norm)\u001b[0m\n\u001b[1;32m 92\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mword_vec\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mngrams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# No ngrams of the word are present in self.ngrams\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 94\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'all ngrams for word %s absent from model'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mword\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 95\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 96\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0minit_sims\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: 'all ngrams for word axe absent from model'" - ] - } - ], + "outputs": [], "source": [ "# Raises a KeyError since none of the character ngrams of the word `axe` are present in the training data\n", - "model_wrapper['axe']" + "try:\n", + " model_wrapper['axe']\n", + "except KeyError:\n", + " #\n", + " # trap the error here so it does not interfere\n", + " # with the execution of the cells below\n", + " #\n", + " pass\n", + "else:\n", + " assert False, 'the above code should have raised a KeyError'" ] }, { @@ -365,7 +351,7 @@ { "data": { "text/plain": [ - "0.9988949391617723" + "0.99999416" ] }, "execution_count": 7, @@ -401,16 +387,16 @@ { "data": { "text/plain": [ - "[(u'bowler', 0.9999216198921204),\n", 
- " (u'flights', 0.999881386756897),\n", - " (u'dozens', 0.9998700618743896),\n", - " (u'each', 0.9998670220375061),\n", - " (u'weather', 0.9998487234115601),\n", - " (u'technology', 0.999805748462677),\n", - " (u'acting', 0.9998006820678711),\n", - " (u'dollars', 0.999785840511322),\n", - " (u'place,', 0.9997731447219849),\n", - " (u'custody', 0.9997485280036926)]" + "[('night', 0.9999646544456482),\n", + " ('flights', 0.9999643564224243),\n", + " ('rights', 0.999963641166687),\n", + " ('night.', 0.9999594688415527),\n", + " ('quarter', 0.9999569654464722),\n", + " ('night,', 0.9999566078186035),\n", + " ('hearing', 0.9999553561210632),\n", + " ('better', 0.9999548196792603),\n", + " ('eight', 0.9999544620513916),\n", + " ('during', 0.999954342842102)]" ] }, "execution_count": 8, @@ -431,7 +417,7 @@ { "data": { "text/plain": [ - "0.99936318443348537" + "0.9999701" ] }, "execution_count": 9, @@ -451,7 +437,7 @@ { "data": { "text/plain": [ - "'dinner'" + "'cereal'" ] }, "execution_count": 10, @@ -471,16 +457,16 @@ { "data": { "text/plain": [ - "[(u'September', 0.9997114539146423),\n", - " (u'Rafter', 0.9996863007545471),\n", - " (u'New', 0.999636709690094),\n", - " (u'after', 0.9996317625045776),\n", - " (u'day', 0.9996190071105957),\n", - " (u'After', 0.9996107816696167),\n", - " (u'against', 0.9996088743209839),\n", - " (u'Robert', 0.9996023178100586),\n", - " (u'attacks', 0.9995726346969604),\n", - " (u'States', 0.9995641112327576)]" + "[('suicide', 0.9997773170471191),\n", + " ('decide', 0.9997694492340088),\n", + " ('side', 0.9997690916061401),\n", + " ('Minister', 0.9997668266296387),\n", + " ('inside', 0.9997666478157043),\n", + " ('Minister,', 0.99976646900177),\n", + " ('ministers', 0.9997649192810059),\n", + " ('Alliance', 0.9997645616531372),\n", + " ('best', 0.9997645020484924),\n", + " ('bombers', 0.9997643232345581)]" ] }, "execution_count": 11, @@ -497,242 +483,220 @@ "execution_count": 12, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "family: 0.0% (0/2)\n", - "gram3-comparative: 0.0% (0/12)\n", - "gram4-superlative: 0.0% (0/12)\n", - "gram5-present-participle: 0.0% (0/20)\n", - "gram6-nationality-adjective: 0.0% (0/20)\n", - "gram7-past-tense: 0.0% (0/20)\n", - "gram8-plural: 0.0% (0/12)\n", - "total: 0.0% (0/98)\n" - ] - }, { "data": { "text/plain": [ - "[{'correct': [], 'incorrect': [], 'section': u'capital-common-countries'},\n", - " {'correct': [], 'incorrect': [], 'section': u'capital-world'},\n", - " {'correct': [], 'incorrect': [], 'section': u'currency'},\n", - " {'correct': [], 'incorrect': [], 'section': u'city-in-state'},\n", - " {'correct': [],\n", - " 'incorrect': [(u'HE', u'SHE', u'HIS', u'HER'),\n", - " (u'HIS', u'HER', u'HE', u'SHE')],\n", - " 'section': u'family'},\n", - " {'correct': [], 'incorrect': [], 'section': u'gram1-adjective-to-adverb'},\n", - " {'correct': [], 'incorrect': [], 'section': u'gram2-opposite'},\n", - " {'correct': [],\n", - " 'incorrect': [(u'GOOD', u'BETTER', u'GREAT', u'GREATER'),\n", - " (u'GOOD', u'BETTER', u'LONG', u'LONGER'),\n", - " (u'GOOD', u'BETTER', u'LOW', u'LOWER'),\n", - " (u'GREAT', u'GREATER', u'LONG', u'LONGER'),\n", - " (u'GREAT', u'GREATER', u'LOW', u'LOWER'),\n", - " (u'GREAT', u'GREATER', u'GOOD', u'BETTER'),\n", - " (u'LONG', u'LONGER', u'LOW', u'LOWER'),\n", - " (u'LONG', u'LONGER', u'GOOD', u'BETTER'),\n", - " (u'LONG', u'LONGER', u'GREAT', u'GREATER'),\n", - " (u'LOW', u'LOWER', u'GOOD', u'BETTER'),\n", - " (u'LOW', u'LOWER', u'GREAT', u'GREATER'),\n", - " 
(u'LOW', u'LOWER', u'LONG', u'LONGER')],\n", - " 'section': u'gram3-comparative'},\n", - " {'correct': [],\n", - " 'incorrect': [(u'BIG', u'BIGGEST', u'GOOD', u'BEST'),\n", - " (u'BIG', u'BIGGEST', u'GREAT', u'GREATEST'),\n", - " (u'BIG', u'BIGGEST', u'LARGE', u'LARGEST'),\n", - " (u'GOOD', u'BEST', u'GREAT', u'GREATEST'),\n", - " (u'GOOD', u'BEST', u'LARGE', u'LARGEST'),\n", - " (u'GOOD', u'BEST', u'BIG', u'BIGGEST'),\n", - " (u'GREAT', u'GREATEST', u'LARGE', u'LARGEST'),\n", - " (u'GREAT', u'GREATEST', u'BIG', u'BIGGEST'),\n", - " (u'GREAT', u'GREATEST', u'GOOD', u'BEST'),\n", - " (u'LARGE', u'LARGEST', u'BIG', u'BIGGEST'),\n", - " (u'LARGE', u'LARGEST', u'GOOD', u'BEST'),\n", - " (u'LARGE', u'LARGEST', u'GREAT', u'GREATEST')],\n", - " 'section': u'gram4-superlative'},\n", - " {'correct': [],\n", - " 'incorrect': [(u'GO', u'GOING', u'LOOK', u'LOOKING'),\n", - " (u'GO', u'GOING', u'PLAY', u'PLAYING'),\n", - " (u'GO', u'GOING', u'RUN', u'RUNNING'),\n", - " (u'GO', u'GOING', u'SAY', u'SAYING'),\n", - " (u'LOOK', u'LOOKING', u'PLAY', u'PLAYING'),\n", - " (u'LOOK', u'LOOKING', u'RUN', u'RUNNING'),\n", - " (u'LOOK', u'LOOKING', u'SAY', u'SAYING'),\n", - " (u'LOOK', u'LOOKING', u'GO', u'GOING'),\n", - " (u'PLAY', u'PLAYING', u'RUN', u'RUNNING'),\n", - " (u'PLAY', u'PLAYING', u'SAY', u'SAYING'),\n", - " (u'PLAY', u'PLAYING', u'GO', u'GOING'),\n", - " (u'PLAY', u'PLAYING', u'LOOK', u'LOOKING'),\n", - " (u'RUN', u'RUNNING', u'SAY', u'SAYING'),\n", - " (u'RUN', u'RUNNING', u'GO', u'GOING'),\n", - " (u'RUN', u'RUNNING', u'LOOK', u'LOOKING'),\n", - " (u'RUN', u'RUNNING', u'PLAY', u'PLAYING'),\n", - " (u'SAY', u'SAYING', u'GO', u'GOING'),\n", - " (u'SAY', u'SAYING', u'LOOK', u'LOOKING'),\n", - " (u'SAY', u'SAYING', u'PLAY', u'PLAYING'),\n", - " (u'SAY', u'SAYING', u'RUN', u'RUNNING')],\n", - " 'section': u'gram5-present-participle'},\n", - " {'correct': [],\n", - " 'incorrect': [(u'AUSTRALIA', u'AUSTRALIAN', u'FRANCE', u'FRENCH'),\n", - " (u'AUSTRALIA', u'AUSTRALIAN', u'INDIA', u'INDIAN'),\n", - " (u'AUSTRALIA', u'AUSTRALIAN', u'ISRAEL', u'ISRAELI'),\n", - " (u'AUSTRALIA', u'AUSTRALIAN', u'SWITZERLAND', u'SWISS'),\n", - " (u'FRANCE', u'FRENCH', u'INDIA', u'INDIAN'),\n", - " (u'FRANCE', u'FRENCH', u'ISRAEL', u'ISRAELI'),\n", - " (u'FRANCE', u'FRENCH', u'SWITZERLAND', u'SWISS'),\n", - " (u'FRANCE', u'FRENCH', u'AUSTRALIA', u'AUSTRALIAN'),\n", - " (u'INDIA', u'INDIAN', u'ISRAEL', u'ISRAELI'),\n", - " (u'INDIA', u'INDIAN', u'SWITZERLAND', u'SWISS'),\n", - " (u'INDIA', u'INDIAN', u'AUSTRALIA', u'AUSTRALIAN'),\n", - " (u'INDIA', u'INDIAN', u'FRANCE', u'FRENCH'),\n", - " (u'ISRAEL', u'ISRAELI', u'SWITZERLAND', u'SWISS'),\n", - " (u'ISRAEL', u'ISRAELI', u'AUSTRALIA', u'AUSTRALIAN'),\n", - " (u'ISRAEL', u'ISRAELI', u'FRANCE', u'FRENCH'),\n", - " (u'ISRAEL', u'ISRAELI', u'INDIA', u'INDIAN'),\n", - " (u'SWITZERLAND', u'SWISS', u'AUSTRALIA', u'AUSTRALIAN'),\n", - " (u'SWITZERLAND', u'SWISS', u'FRANCE', u'FRENCH'),\n", - " (u'SWITZERLAND', u'SWISS', u'INDIA', u'INDIAN'),\n", - " (u'SWITZERLAND', u'SWISS', u'ISRAEL', u'ISRAELI')],\n", - " 'section': u'gram6-nationality-adjective'},\n", - " {'correct': [],\n", - " 'incorrect': [(u'GOING', u'WENT', u'PAYING', u'PAID'),\n", - " (u'GOING', u'WENT', u'PLAYING', u'PLAYED'),\n", - " (u'GOING', u'WENT', u'SAYING', u'SAID'),\n", - " (u'GOING', u'WENT', u'TAKING', u'TOOK'),\n", - " (u'PAYING', u'PAID', u'PLAYING', u'PLAYED'),\n", - " (u'PAYING', u'PAID', u'SAYING', u'SAID'),\n", - " (u'PAYING', u'PAID', u'TAKING', u'TOOK'),\n", - " (u'PAYING', u'PAID', u'GOING', 
u'WENT'),\n", - " (u'PLAYING', u'PLAYED', u'SAYING', u'SAID'),\n", - " (u'PLAYING', u'PLAYED', u'TAKING', u'TOOK'),\n", - " (u'PLAYING', u'PLAYED', u'GOING', u'WENT'),\n", - " (u'PLAYING', u'PLAYED', u'PAYING', u'PAID'),\n", - " (u'SAYING', u'SAID', u'TAKING', u'TOOK'),\n", - " (u'SAYING', u'SAID', u'GOING', u'WENT'),\n", - " (u'SAYING', u'SAID', u'PAYING', u'PAID'),\n", - " (u'SAYING', u'SAID', u'PLAYING', u'PLAYED'),\n", - " (u'TAKING', u'TOOK', u'GOING', u'WENT'),\n", - " (u'TAKING', u'TOOK', u'PAYING', u'PAID'),\n", - " (u'TAKING', u'TOOK', u'PLAYING', u'PLAYED'),\n", - " (u'TAKING', u'TOOK', u'SAYING', u'SAID')],\n", - " 'section': u'gram7-past-tense'},\n", - " {'correct': [],\n", - " 'incorrect': [(u'BUILDING', u'BUILDINGS', u'CAR', u'CARS'),\n", - " (u'BUILDING', u'BUILDINGS', u'CHILD', u'CHILDREN'),\n", - " (u'BUILDING', u'BUILDINGS', u'MAN', u'MEN'),\n", - " (u'CAR', u'CARS', u'CHILD', u'CHILDREN'),\n", - " (u'CAR', u'CARS', u'MAN', u'MEN'),\n", - " (u'CAR', u'CARS', u'BUILDING', u'BUILDINGS'),\n", - " (u'CHILD', u'CHILDREN', u'MAN', u'MEN'),\n", - " (u'CHILD', u'CHILDREN', u'BUILDING', u'BUILDINGS'),\n", - " (u'CHILD', u'CHILDREN', u'CAR', u'CARS'),\n", - " (u'MAN', u'MEN', u'BUILDING', u'BUILDINGS'),\n", - " (u'MAN', u'MEN', u'CAR', u'CARS'),\n", - " (u'MAN', u'MEN', u'CHILD', u'CHILDREN')],\n", - " 'section': u'gram8-plural'},\n", - " {'correct': [], 'incorrect': [], 'section': u'gram9-plural-verbs'},\n", - " {'correct': [],\n", - " 'incorrect': [(u'HE', u'SHE', u'HIS', u'HER'),\n", - " (u'HIS', u'HER', u'HE', u'SHE'),\n", - " (u'GOOD', u'BETTER', u'GREAT', u'GREATER'),\n", - " (u'GOOD', u'BETTER', u'LONG', u'LONGER'),\n", - " (u'GOOD', u'BETTER', u'LOW', u'LOWER'),\n", - " (u'GREAT', u'GREATER', u'LONG', u'LONGER'),\n", - " (u'GREAT', u'GREATER', u'LOW', u'LOWER'),\n", - " (u'GREAT', u'GREATER', u'GOOD', u'BETTER'),\n", - " (u'LONG', u'LONGER', u'LOW', u'LOWER'),\n", - " (u'LONG', u'LONGER', u'GOOD', u'BETTER'),\n", - " (u'LONG', u'LONGER', u'GREAT', u'GREATER'),\n", - " (u'LOW', u'LOWER', u'GOOD', u'BETTER'),\n", - " (u'LOW', u'LOWER', u'GREAT', u'GREATER'),\n", - " (u'LOW', u'LOWER', u'LONG', u'LONGER'),\n", - " (u'BIG', u'BIGGEST', u'GOOD', u'BEST'),\n", - " (u'BIG', u'BIGGEST', u'GREAT', u'GREATEST'),\n", - " (u'BIG', u'BIGGEST', u'LARGE', u'LARGEST'),\n", - " (u'GOOD', u'BEST', u'GREAT', u'GREATEST'),\n", - " (u'GOOD', u'BEST', u'LARGE', u'LARGEST'),\n", - " (u'GOOD', u'BEST', u'BIG', u'BIGGEST'),\n", - " (u'GREAT', u'GREATEST', u'LARGE', u'LARGEST'),\n", - " (u'GREAT', u'GREATEST', u'BIG', u'BIGGEST'),\n", - " (u'GREAT', u'GREATEST', u'GOOD', u'BEST'),\n", - " (u'LARGE', u'LARGEST', u'BIG', u'BIGGEST'),\n", - " (u'LARGE', u'LARGEST', u'GOOD', u'BEST'),\n", - " (u'LARGE', u'LARGEST', u'GREAT', u'GREATEST'),\n", - " (u'GO', u'GOING', u'LOOK', u'LOOKING'),\n", - " (u'GO', u'GOING', u'PLAY', u'PLAYING'),\n", - " (u'GO', u'GOING', u'RUN', u'RUNNING'),\n", - " (u'GO', u'GOING', u'SAY', u'SAYING'),\n", - " (u'LOOK', u'LOOKING', u'PLAY', u'PLAYING'),\n", - " (u'LOOK', u'LOOKING', u'RUN', u'RUNNING'),\n", - " (u'LOOK', u'LOOKING', u'SAY', u'SAYING'),\n", - " (u'LOOK', u'LOOKING', u'GO', u'GOING'),\n", - " (u'PLAY', u'PLAYING', u'RUN', u'RUNNING'),\n", - " (u'PLAY', u'PLAYING', u'SAY', u'SAYING'),\n", - " (u'PLAY', u'PLAYING', u'GO', u'GOING'),\n", - " (u'PLAY', u'PLAYING', u'LOOK', u'LOOKING'),\n", - " (u'RUN', u'RUNNING', u'SAY', u'SAYING'),\n", - " (u'RUN', u'RUNNING', u'GO', u'GOING'),\n", - " (u'RUN', u'RUNNING', u'LOOK', u'LOOKING'),\n", - " (u'RUN', u'RUNNING', u'PLAY', 
u'PLAYING'),\n", - " (u'SAY', u'SAYING', u'GO', u'GOING'),\n", - " (u'SAY', u'SAYING', u'LOOK', u'LOOKING'),\n", - " (u'SAY', u'SAYING', u'PLAY', u'PLAYING'),\n", - " (u'SAY', u'SAYING', u'RUN', u'RUNNING'),\n", - " (u'AUSTRALIA', u'AUSTRALIAN', u'FRANCE', u'FRENCH'),\n", - " (u'AUSTRALIA', u'AUSTRALIAN', u'INDIA', u'INDIAN'),\n", - " (u'AUSTRALIA', u'AUSTRALIAN', u'ISRAEL', u'ISRAELI'),\n", - " (u'AUSTRALIA', u'AUSTRALIAN', u'SWITZERLAND', u'SWISS'),\n", - " (u'FRANCE', u'FRENCH', u'INDIA', u'INDIAN'),\n", - " (u'FRANCE', u'FRENCH', u'ISRAEL', u'ISRAELI'),\n", - " (u'FRANCE', u'FRENCH', u'SWITZERLAND', u'SWISS'),\n", - " (u'FRANCE', u'FRENCH', u'AUSTRALIA', u'AUSTRALIAN'),\n", - " (u'INDIA', u'INDIAN', u'ISRAEL', u'ISRAELI'),\n", - " (u'INDIA', u'INDIAN', u'SWITZERLAND', u'SWISS'),\n", - " (u'INDIA', u'INDIAN', u'AUSTRALIA', u'AUSTRALIAN'),\n", - " (u'INDIA', u'INDIAN', u'FRANCE', u'FRENCH'),\n", - " (u'ISRAEL', u'ISRAELI', u'SWITZERLAND', u'SWISS'),\n", - " (u'ISRAEL', u'ISRAELI', u'AUSTRALIA', u'AUSTRALIAN'),\n", - " (u'ISRAEL', u'ISRAELI', u'FRANCE', u'FRENCH'),\n", - " (u'ISRAEL', u'ISRAELI', u'INDIA', u'INDIAN'),\n", - " (u'SWITZERLAND', u'SWISS', u'AUSTRALIA', u'AUSTRALIAN'),\n", - " (u'SWITZERLAND', u'SWISS', u'FRANCE', u'FRENCH'),\n", - " (u'SWITZERLAND', u'SWISS', u'INDIA', u'INDIAN'),\n", - " (u'SWITZERLAND', u'SWISS', u'ISRAEL', u'ISRAELI'),\n", - " (u'GOING', u'WENT', u'PAYING', u'PAID'),\n", - " (u'GOING', u'WENT', u'PLAYING', u'PLAYED'),\n", - " (u'GOING', u'WENT', u'SAYING', u'SAID'),\n", - " (u'GOING', u'WENT', u'TAKING', u'TOOK'),\n", - " (u'PAYING', u'PAID', u'PLAYING', u'PLAYED'),\n", - " (u'PAYING', u'PAID', u'SAYING', u'SAID'),\n", - " (u'PAYING', u'PAID', u'TAKING', u'TOOK'),\n", - " (u'PAYING', u'PAID', u'GOING', u'WENT'),\n", - " (u'PLAYING', u'PLAYED', u'SAYING', u'SAID'),\n", - " (u'PLAYING', u'PLAYED', u'TAKING', u'TOOK'),\n", - " (u'PLAYING', u'PLAYED', u'GOING', u'WENT'),\n", - " (u'PLAYING', u'PLAYED', u'PAYING', u'PAID'),\n", - " (u'SAYING', u'SAID', u'TAKING', u'TOOK'),\n", - " (u'SAYING', u'SAID', u'GOING', u'WENT'),\n", - " (u'SAYING', u'SAID', u'PAYING', u'PAID'),\n", - " (u'SAYING', u'SAID', u'PLAYING', u'PLAYED'),\n", - " (u'TAKING', u'TOOK', u'GOING', u'WENT'),\n", - " (u'TAKING', u'TOOK', u'PAYING', u'PAID'),\n", - " (u'TAKING', u'TOOK', u'PLAYING', u'PLAYED'),\n", - " (u'TAKING', u'TOOK', u'SAYING', u'SAID'),\n", - " (u'BUILDING', u'BUILDINGS', u'CAR', u'CARS'),\n", - " (u'BUILDING', u'BUILDINGS', u'CHILD', u'CHILDREN'),\n", - " (u'BUILDING', u'BUILDINGS', u'MAN', u'MEN'),\n", - " (u'CAR', u'CARS', u'CHILD', u'CHILDREN'),\n", - " (u'CAR', u'CARS', u'MAN', u'MEN'),\n", - " (u'CAR', u'CARS', u'BUILDING', u'BUILDINGS'),\n", - " (u'CHILD', u'CHILDREN', u'MAN', u'MEN'),\n", - " (u'CHILD', u'CHILDREN', u'BUILDING', u'BUILDINGS'),\n", - " (u'CHILD', u'CHILDREN', u'CAR', u'CARS'),\n", - " (u'MAN', u'MEN', u'BUILDING', u'BUILDINGS'),\n", - " (u'MAN', u'MEN', u'CAR', u'CARS'),\n", - " (u'MAN', u'MEN', u'CHILD', u'CHILDREN')],\n", - " 'section': 'total'}]" + "[{'section': 'capital-common-countries', 'correct': [], 'incorrect': []},\n", + " {'section': 'capital-world', 'correct': [], 'incorrect': []},\n", + " {'section': 'currency', 'correct': [], 'incorrect': []},\n", + " {'section': 'city-in-state', 'correct': [], 'incorrect': []},\n", + " {'section': 'family',\n", + " 'correct': [],\n", + " 'incorrect': [('HE', 'SHE', 'HIS', 'HER'), ('HIS', 'HER', 'HE', 'SHE')]},\n", + " {'section': 'gram1-adjective-to-adverb', 'correct': [], 'incorrect': []},\n", + " 
{'section': 'gram2-opposite', 'correct': [], 'incorrect': []},\n", + " {'section': 'gram3-comparative',\n", + " 'correct': [('GREAT', 'GREATER', 'LOW', 'LOWER'),\n", + " ('LONG', 'LONGER', 'LOW', 'LOWER'),\n", + " ('LOW', 'LOWER', 'GREAT', 'GREATER')],\n", + " 'incorrect': [('GOOD', 'BETTER', 'GREAT', 'GREATER'),\n", + " ('GOOD', 'BETTER', 'LONG', 'LONGER'),\n", + " ('GOOD', 'BETTER', 'LOW', 'LOWER'),\n", + " ('GREAT', 'GREATER', 'LONG', 'LONGER'),\n", + " ('GREAT', 'GREATER', 'GOOD', 'BETTER'),\n", + " ('LONG', 'LONGER', 'GOOD', 'BETTER'),\n", + " ('LONG', 'LONGER', 'GREAT', 'GREATER'),\n", + " ('LOW', 'LOWER', 'GOOD', 'BETTER'),\n", + " ('LOW', 'LOWER', 'LONG', 'LONGER')]},\n", + " {'section': 'gram4-superlative',\n", + " 'correct': [('GOOD', 'BEST', 'GREAT', 'GREATEST'),\n", + " ('GOOD', 'BEST', 'LARGE', 'LARGEST'),\n", + " ('GOOD', 'BEST', 'BIG', 'BIGGEST'),\n", + " ('GREAT', 'GREATEST', 'BIG', 'BIGGEST'),\n", + " ('LARGE', 'LARGEST', 'BIG', 'BIGGEST'),\n", + " ('LARGE', 'LARGEST', 'GREAT', 'GREATEST')],\n", + " 'incorrect': [('BIG', 'BIGGEST', 'GOOD', 'BEST'),\n", + " ('BIG', 'BIGGEST', 'GREAT', 'GREATEST'),\n", + " ('BIG', 'BIGGEST', 'LARGE', 'LARGEST'),\n", + " ('GREAT', 'GREATEST', 'LARGE', 'LARGEST'),\n", + " ('GREAT', 'GREATEST', 'GOOD', 'BEST'),\n", + " ('LARGE', 'LARGEST', 'GOOD', 'BEST')]},\n", + " {'section': 'gram5-present-participle',\n", + " 'correct': [('GO', 'GOING', 'LOOK', 'LOOKING'),\n", + " ('PLAY', 'PLAYING', 'SAY', 'SAYING'),\n", + " ('PLAY', 'PLAYING', 'LOOK', 'LOOKING'),\n", + " ('SAY', 'SAYING', 'LOOK', 'LOOKING'),\n", + " ('SAY', 'SAYING', 'PLAY', 'PLAYING')],\n", + " 'incorrect': [('GO', 'GOING', 'PLAY', 'PLAYING'),\n", + " ('GO', 'GOING', 'RUN', 'RUNNING'),\n", + " ('GO', 'GOING', 'SAY', 'SAYING'),\n", + " ('LOOK', 'LOOKING', 'PLAY', 'PLAYING'),\n", + " ('LOOK', 'LOOKING', 'RUN', 'RUNNING'),\n", + " ('LOOK', 'LOOKING', 'SAY', 'SAYING'),\n", + " ('LOOK', 'LOOKING', 'GO', 'GOING'),\n", + " ('PLAY', 'PLAYING', 'RUN', 'RUNNING'),\n", + " ('PLAY', 'PLAYING', 'GO', 'GOING'),\n", + " ('RUN', 'RUNNING', 'SAY', 'SAYING'),\n", + " ('RUN', 'RUNNING', 'GO', 'GOING'),\n", + " ('RUN', 'RUNNING', 'LOOK', 'LOOKING'),\n", + " ('RUN', 'RUNNING', 'PLAY', 'PLAYING'),\n", + " ('SAY', 'SAYING', 'GO', 'GOING'),\n", + " ('SAY', 'SAYING', 'RUN', 'RUNNING')]},\n", + " {'section': 'gram6-nationality-adjective',\n", + " 'correct': [('AUSTRALIA', 'AUSTRALIAN', 'INDIA', 'INDIAN'),\n", + " ('AUSTRALIA', 'AUSTRALIAN', 'ISRAEL', 'ISRAELI'),\n", + " ('INDIA', 'INDIAN', 'AUSTRALIA', 'AUSTRALIAN'),\n", + " ('ISRAEL', 'ISRAELI', 'INDIA', 'INDIAN'),\n", + " ('SWITZERLAND', 'SWISS', 'INDIA', 'INDIAN')],\n", + " 'incorrect': [('AUSTRALIA', 'AUSTRALIAN', 'FRANCE', 'FRENCH'),\n", + " ('AUSTRALIA', 'AUSTRALIAN', 'SWITZERLAND', 'SWISS'),\n", + " ('FRANCE', 'FRENCH', 'INDIA', 'INDIAN'),\n", + " ('FRANCE', 'FRENCH', 'ISRAEL', 'ISRAELI'),\n", + " ('FRANCE', 'FRENCH', 'SWITZERLAND', 'SWISS'),\n", + " ('FRANCE', 'FRENCH', 'AUSTRALIA', 'AUSTRALIAN'),\n", + " ('INDIA', 'INDIAN', 'ISRAEL', 'ISRAELI'),\n", + " ('INDIA', 'INDIAN', 'SWITZERLAND', 'SWISS'),\n", + " ('INDIA', 'INDIAN', 'FRANCE', 'FRENCH'),\n", + " ('ISRAEL', 'ISRAELI', 'SWITZERLAND', 'SWISS'),\n", + " ('ISRAEL', 'ISRAELI', 'AUSTRALIA', 'AUSTRALIAN'),\n", + " ('ISRAEL', 'ISRAELI', 'FRANCE', 'FRENCH'),\n", + " ('SWITZERLAND', 'SWISS', 'AUSTRALIA', 'AUSTRALIAN'),\n", + " ('SWITZERLAND', 'SWISS', 'FRANCE', 'FRENCH'),\n", + " ('SWITZERLAND', 'SWISS', 'ISRAEL', 'ISRAELI')]},\n", + " {'section': 'gram7-past-tense',\n", + " 'correct': [('PAYING', 'PAID', 
'SAYING', 'SAID')],\n", + " 'incorrect': [('GOING', 'WENT', 'PAYING', 'PAID'),\n", + " ('GOING', 'WENT', 'PLAYING', 'PLAYED'),\n", + " ('GOING', 'WENT', 'SAYING', 'SAID'),\n", + " ('GOING', 'WENT', 'TAKING', 'TOOK'),\n", + " ('PAYING', 'PAID', 'PLAYING', 'PLAYED'),\n", + " ('PAYING', 'PAID', 'TAKING', 'TOOK'),\n", + " ('PAYING', 'PAID', 'GOING', 'WENT'),\n", + " ('PLAYING', 'PLAYED', 'SAYING', 'SAID'),\n", + " ('PLAYING', 'PLAYED', 'TAKING', 'TOOK'),\n", + " ('PLAYING', 'PLAYED', 'GOING', 'WENT'),\n", + " ('PLAYING', 'PLAYED', 'PAYING', 'PAID'),\n", + " ('SAYING', 'SAID', 'TAKING', 'TOOK'),\n", + " ('SAYING', 'SAID', 'GOING', 'WENT'),\n", + " ('SAYING', 'SAID', 'PAYING', 'PAID'),\n", + " ('SAYING', 'SAID', 'PLAYING', 'PLAYED'),\n", + " ('TAKING', 'TOOK', 'GOING', 'WENT'),\n", + " ('TAKING', 'TOOK', 'PAYING', 'PAID'),\n", + " ('TAKING', 'TOOK', 'PLAYING', 'PLAYED'),\n", + " ('TAKING', 'TOOK', 'SAYING', 'SAID')]},\n", + " {'section': 'gram8-plural',\n", + " 'correct': [('BUILDING', 'BUILDINGS', 'CHILD', 'CHILDREN'),\n", + " ('CHILD', 'CHILDREN', 'CAR', 'CARS'),\n", + " ('MAN', 'MEN', 'CAR', 'CARS')],\n", + " 'incorrect': [('BUILDING', 'BUILDINGS', 'CAR', 'CARS'),\n", + " ('BUILDING', 'BUILDINGS', 'MAN', 'MEN'),\n", + " ('CAR', 'CARS', 'CHILD', 'CHILDREN'),\n", + " ('CAR', 'CARS', 'MAN', 'MEN'),\n", + " ('CAR', 'CARS', 'BUILDING', 'BUILDINGS'),\n", + " ('CHILD', 'CHILDREN', 'MAN', 'MEN'),\n", + " ('CHILD', 'CHILDREN', 'BUILDING', 'BUILDINGS'),\n", + " ('MAN', 'MEN', 'BUILDING', 'BUILDINGS'),\n", + " ('MAN', 'MEN', 'CHILD', 'CHILDREN')]},\n", + " {'section': 'gram9-plural-verbs', 'correct': [], 'incorrect': []},\n", + " {'section': 'total',\n", + " 'correct': [('GREAT', 'GREATER', 'LOW', 'LOWER'),\n", + " ('LONG', 'LONGER', 'LOW', 'LOWER'),\n", + " ('LOW', 'LOWER', 'GREAT', 'GREATER'),\n", + " ('GOOD', 'BEST', 'GREAT', 'GREATEST'),\n", + " ('GOOD', 'BEST', 'LARGE', 'LARGEST'),\n", + " ('GOOD', 'BEST', 'BIG', 'BIGGEST'),\n", + " ('GREAT', 'GREATEST', 'BIG', 'BIGGEST'),\n", + " ('LARGE', 'LARGEST', 'BIG', 'BIGGEST'),\n", + " ('LARGE', 'LARGEST', 'GREAT', 'GREATEST'),\n", + " ('GO', 'GOING', 'LOOK', 'LOOKING'),\n", + " ('PLAY', 'PLAYING', 'SAY', 'SAYING'),\n", + " ('PLAY', 'PLAYING', 'LOOK', 'LOOKING'),\n", + " ('SAY', 'SAYING', 'LOOK', 'LOOKING'),\n", + " ('SAY', 'SAYING', 'PLAY', 'PLAYING'),\n", + " ('AUSTRALIA', 'AUSTRALIAN', 'INDIA', 'INDIAN'),\n", + " ('AUSTRALIA', 'AUSTRALIAN', 'ISRAEL', 'ISRAELI'),\n", + " ('INDIA', 'INDIAN', 'AUSTRALIA', 'AUSTRALIAN'),\n", + " ('ISRAEL', 'ISRAELI', 'INDIA', 'INDIAN'),\n", + " ('SWITZERLAND', 'SWISS', 'INDIA', 'INDIAN'),\n", + " ('PAYING', 'PAID', 'SAYING', 'SAID'),\n", + " ('BUILDING', 'BUILDINGS', 'CHILD', 'CHILDREN'),\n", + " ('CHILD', 'CHILDREN', 'CAR', 'CARS'),\n", + " ('MAN', 'MEN', 'CAR', 'CARS')],\n", + " 'incorrect': [('HE', 'SHE', 'HIS', 'HER'),\n", + " ('HIS', 'HER', 'HE', 'SHE'),\n", + " ('GOOD', 'BETTER', 'GREAT', 'GREATER'),\n", + " ('GOOD', 'BETTER', 'LONG', 'LONGER'),\n", + " ('GOOD', 'BETTER', 'LOW', 'LOWER'),\n", + " ('GREAT', 'GREATER', 'LONG', 'LONGER'),\n", + " ('GREAT', 'GREATER', 'GOOD', 'BETTER'),\n", + " ('LONG', 'LONGER', 'GOOD', 'BETTER'),\n", + " ('LONG', 'LONGER', 'GREAT', 'GREATER'),\n", + " ('LOW', 'LOWER', 'GOOD', 'BETTER'),\n", + " ('LOW', 'LOWER', 'LONG', 'LONGER'),\n", + " ('BIG', 'BIGGEST', 'GOOD', 'BEST'),\n", + " ('BIG', 'BIGGEST', 'GREAT', 'GREATEST'),\n", + " ('BIG', 'BIGGEST', 'LARGE', 'LARGEST'),\n", + " ('GREAT', 'GREATEST', 'LARGE', 'LARGEST'),\n", + " ('GREAT', 'GREATEST', 'GOOD', 'BEST'),\n", + " ('LARGE', 
'LARGEST', 'GOOD', 'BEST'),\n", + " ('GO', 'GOING', 'PLAY', 'PLAYING'),\n", + " ('GO', 'GOING', 'RUN', 'RUNNING'),\n", + " ('GO', 'GOING', 'SAY', 'SAYING'),\n", + " ('LOOK', 'LOOKING', 'PLAY', 'PLAYING'),\n", + " ('LOOK', 'LOOKING', 'RUN', 'RUNNING'),\n", + " ('LOOK', 'LOOKING', 'SAY', 'SAYING'),\n", + " ('LOOK', 'LOOKING', 'GO', 'GOING'),\n", + " ('PLAY', 'PLAYING', 'RUN', 'RUNNING'),\n", + " ('PLAY', 'PLAYING', 'GO', 'GOING'),\n", + " ('RUN', 'RUNNING', 'SAY', 'SAYING'),\n", + " ('RUN', 'RUNNING', 'GO', 'GOING'),\n", + " ('RUN', 'RUNNING', 'LOOK', 'LOOKING'),\n", + " ('RUN', 'RUNNING', 'PLAY', 'PLAYING'),\n", + " ('SAY', 'SAYING', 'GO', 'GOING'),\n", + " ('SAY', 'SAYING', 'RUN', 'RUNNING'),\n", + " ('AUSTRALIA', 'AUSTRALIAN', 'FRANCE', 'FRENCH'),\n", + " ('AUSTRALIA', 'AUSTRALIAN', 'SWITZERLAND', 'SWISS'),\n", + " ('FRANCE', 'FRENCH', 'INDIA', 'INDIAN'),\n", + " ('FRANCE', 'FRENCH', 'ISRAEL', 'ISRAELI'),\n", + " ('FRANCE', 'FRENCH', 'SWITZERLAND', 'SWISS'),\n", + " ('FRANCE', 'FRENCH', 'AUSTRALIA', 'AUSTRALIAN'),\n", + " ('INDIA', 'INDIAN', 'ISRAEL', 'ISRAELI'),\n", + " ('INDIA', 'INDIAN', 'SWITZERLAND', 'SWISS'),\n", + " ('INDIA', 'INDIAN', 'FRANCE', 'FRENCH'),\n", + " ('ISRAEL', 'ISRAELI', 'SWITZERLAND', 'SWISS'),\n", + " ('ISRAEL', 'ISRAELI', 'AUSTRALIA', 'AUSTRALIAN'),\n", + " ('ISRAEL', 'ISRAELI', 'FRANCE', 'FRENCH'),\n", + " ('SWITZERLAND', 'SWISS', 'AUSTRALIA', 'AUSTRALIAN'),\n", + " ('SWITZERLAND', 'SWISS', 'FRANCE', 'FRENCH'),\n", + " ('SWITZERLAND', 'SWISS', 'ISRAEL', 'ISRAELI'),\n", + " ('GOING', 'WENT', 'PAYING', 'PAID'),\n", + " ('GOING', 'WENT', 'PLAYING', 'PLAYED'),\n", + " ('GOING', 'WENT', 'SAYING', 'SAID'),\n", + " ('GOING', 'WENT', 'TAKING', 'TOOK'),\n", + " ('PAYING', 'PAID', 'PLAYING', 'PLAYED'),\n", + " ('PAYING', 'PAID', 'TAKING', 'TOOK'),\n", + " ('PAYING', 'PAID', 'GOING', 'WENT'),\n", + " ('PLAYING', 'PLAYED', 'SAYING', 'SAID'),\n", + " ('PLAYING', 'PLAYED', 'TAKING', 'TOOK'),\n", + " ('PLAYING', 'PLAYED', 'GOING', 'WENT'),\n", + " ('PLAYING', 'PLAYED', 'PAYING', 'PAID'),\n", + " ('SAYING', 'SAID', 'TAKING', 'TOOK'),\n", + " ('SAYING', 'SAID', 'GOING', 'WENT'),\n", + " ('SAYING', 'SAID', 'PAYING', 'PAID'),\n", + " ('SAYING', 'SAID', 'PLAYING', 'PLAYED'),\n", + " ('TAKING', 'TOOK', 'GOING', 'WENT'),\n", + " ('TAKING', 'TOOK', 'PAYING', 'PAID'),\n", + " ('TAKING', 'TOOK', 'PLAYING', 'PLAYED'),\n", + " ('TAKING', 'TOOK', 'SAYING', 'SAID'),\n", + " ('BUILDING', 'BUILDINGS', 'CAR', 'CARS'),\n", + " ('BUILDING', 'BUILDINGS', 'MAN', 'MEN'),\n", + " ('CAR', 'CARS', 'CHILD', 'CHILDREN'),\n", + " ('CAR', 'CARS', 'MAN', 'MEN'),\n", + " ('CAR', 'CARS', 'BUILDING', 'BUILDINGS'),\n", + " ('CHILD', 'CHILDREN', 'MAN', 'MEN'),\n", + " ('CHILD', 'CHILDREN', 'BUILDING', 'BUILDINGS'),\n", + " ('MAN', 'MEN', 'BUILDING', 'BUILDINGS'),\n", + " ('MAN', 'MEN', 'CHILD', 'CHILDREN')]}]" ] }, "execution_count": 12, @@ -741,9 +705,7 @@ } ], "source": [ - "question_file_path = data_dir + 'questions-words.txt'\n", - "\n", - "model_wrapper.accuracy(questions=question_file_path)" + "model_wrapper.accuracy(questions=datapath('questions-words.txt'))" ] }, { @@ -754,7 +716,7 @@ { "data": { "text/plain": [ - "1.1102867164706653" + "1.1245153746934533" ] }, "execution_count": 13, @@ -781,9 +743,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -791,21 +751,21 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": 
"python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.13" + "pygments_lexer": "ipython3", + "version": "3.7.1" } }, "nbformat": 4, From 177c712fff2ab8c17e090c46f2114cfa1624a4b9 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 27 Jan 2019 15:26:33 +1100 Subject: [PATCH 19/39] minor documentation update --- gensim/models/fasttext.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index dba784e3a3..b9954479ca 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -40,6 +40,10 @@ >>> model.build_vocab(sentences=common_texts) >>> model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10) # train +Once you have a model, you can access its keyed vectors via the `model.wv` attributes. +The keyed vectors instance is quite powerful: it can perform a wide range of NLP tasks. +For a full list of examples, see :class:`~gensim.models.keyedvectors.FastTextKeyedVectors`. + You can also pass all the above parameters to the constructor to do everything in a single line: @@ -50,7 +54,7 @@ ... sentences=common_texts, iter=10 ... ) -.. Important: +.. Important:: We intend to deprecate this second method of passing everything through the constructor. The motivation is to simplify the API and resolve naming inconsistencies, e.g. the iter parameter to the constructor is called epochs in the train function. @@ -58,10 +62,13 @@ The two models above are instantiated differently, but behave identically. For example, we can compare the embeddings they've calculated for the word "computer": +.. sourcecode:: pycon + >>> import numpy as np >>> np.allclose(model.wv['computer'], model2.wv['computer']) True + In the above examples, we trained the model from sentences (lists of words) loaded into memory. This is OK for smaller datasets, but for larger datasets, we recommend streaming the file, for example from disk or the network. @@ -75,16 +82,34 @@ >>> corpus_file = datapath('lee_background.cor') # absolute path to corpus >>> model3 = FastText(size=4, window=3, min_count=1) >>> model3.build_vocab(corpus_file=corpus_file) # scan over corpus to build the vocabulary - >>> total_examples = model.corpus_count # number of sentences in the corpus - >>> total_words = model.corpus_total_words # number of words in the corpus + >>> total_examples = model3.corpus_count # number of sentences in the corpus + >>> total_words = model3.corpus_total_words # number of words in the corpus >>> model3.train(corpus_file=corpus_file, total_examples=total_examples, total_words=total_words, epochs=5) The model needs the `total_examples` and `total_words` parameters in order to manage the training rate (alpha) correctly, and to give accurate progress estimates. -The above example relies on an implementation detail: the build_vocab method +The above example relies on an implementation detail: the +:meth:`~gensim.models.fasttext.FastText.build_vocab` method sets the `corpus_count` and `corpus_total_words` model attributes. You may calculate them by scanning over the corpus yourself, too. +If you have a corpus in a different format, then you can use it by wrapping it +in an `iterator `_. +Your iterator should yield a list of strings each time. 
+Gensim will take care of the rest: + +.. sourcecode:: pycon + + >>> class MyIter: + ... def __iter__(self): + ... with open(datapath('crime-and-punishment.txt')) as fin: + ... for line in fin: + ... yield line.lower().strip().split(" ") + >>> model4 = FastText(size=4, window=3, min_count=1) + >>> model4.build_vocab(sentences=MyIter()) + >>> total_examples = model4.corpus_count + >>> model4.train(sentences=MyIter(), total_examples=total_examples, epochs=5) + Persist a model to disk with: .. sourcecode:: pycon From bd83886435d2e2d76317e76cc88188cfde887382 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 27 Jan 2019 15:45:29 +1100 Subject: [PATCH 20/39] flake8-docs --- gensim/models/fasttext.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index b9954479ca..b4875a9bca 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -105,6 +105,8 @@ ... with open(datapath('crime-and-punishment.txt')) as fin: ... for line in fin: ... yield line.lower().strip().split(" ") + >>> + >>> >>> model4 = FastText(size=4, window=3, min_count=1) >>> model4.build_vocab(sentences=MyIter()) >>> total_examples = model4.corpus_count From 11fabca71b4487dc47a4058c552c6f3356c9ece1 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 27 Jan 2019 16:05:28 +1100 Subject: [PATCH 21/39] more doco fixes --- gensim/models/fasttext.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index b4875a9bca..9f38aedb14 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -131,19 +131,25 @@ >>> import numpy as np >>> old_computer = np.copy(model.wv['computer']) # Grab the existing vector for this word >>> new_sentences = [ - ... ['computer', 'artificial', 'intelligence'], - ... ['artificial', 'trees'], - ... ['human', 'intelligence'], - ... ['artificial', 'graph'], - ... ['intelligence'], - ... ['artificial', 'intelligence', 'system'] + ... ['computer', 'aided', 'design'], + ... ['computer', 'science'], + ... ['computational', 'complexity'], + ... ['military', 'supercomputer'], + ... ['central', 'processing', 'unit'], + ... ['onboard', 'car', 'computer'], ... ] + >>> model.build_vocab(new_sentences, update=True) # Update the vocabulary >>> model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs) >>> new_computer = model.wv['computer'] >>> # FIXME: why is this True?? >>> np.allclose(old_computer, new_computer, atol=1e-4) False +.. Important:: + Be sure to call the :meth:`~gensim.models.fasttext.FastText.build_vocab` + method before the :meth:`~gensim.models.fasttext.FastText.train` method + when continuing training. + You can also load models trained with Facebook's fastText implementation: .. sourcecode:: pycon @@ -162,11 +168,13 @@ >>> 'computer' in fb_full.wv.vocab # New word, currently out of vocab False >>> old_computer = np.copy(fb_full.wv['computer']) # Calculate current vectors + >>> fb_full.build_vocab(new_sentences, update=True) >>> fb_full.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs) >>> new_computer = fb_full.wv['computer'] - >>> # FIXME: why is this True?? 
>>> np.allclose(old_computer, new_computer, atol=1e-4) # Vector has changed, model has learnt something False + >>> 'computer' in fb_full.wv.vocab # New word is now in the vocabulary + True Retrieve word-vector for vocab and out-of-vocab word: From 2d490f027af5a0238974a679df62bde341fe4682 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 27 Jan 2019 16:33:04 +1100 Subject: [PATCH 22/39] fix example --- gensim/models/fasttext.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 9f38aedb14..ad850607ad 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -129,7 +129,9 @@ .. sourcecode:: pycon >>> import numpy as np - >>> old_computer = np.copy(model.wv['computer']) # Grab the existing vector for this word + >>> 'computation' in model.wv.vocab # New word, currently out of vocab + False + >>> old_vector = np.copy(model.wv['computation']) # Grab the existing vector >>> new_sentences = [ ... ['computer', 'aided', 'design'], ... ['computer', 'science'], @@ -140,9 +142,10 @@ ... ] >>> model.build_vocab(new_sentences, update=True) # Update the vocabulary >>> model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs) - >>> new_computer = model.wv['computer'] - >>> # FIXME: why is this True?? - >>> np.allclose(old_computer, new_computer, atol=1e-4) + >>> new_vector = model.wv['computation'] + >>> np.allclose(old_vector, new_vector, atol=1e-4) # Vector has changed, model has learnt something + False + >>> 'computation' in model.wv.vocab # Word is still out of vocab False .. Important:: From 9b6f8bb0526951747e082ba4e9a08f003e0d81c4 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 27 Jan 2019 16:41:03 +1100 Subject: [PATCH 23/39] git rm docs/fasttext-notes.md --- docs/fasttext-notes.md | 152 ----------------------------------------- 1 file changed, 152 deletions(-) delete mode 100644 docs/fasttext-notes.md diff --git a/docs/fasttext-notes.md b/docs/fasttext-notes.md deleted file mode 100644 index 5b11b7de6a..0000000000 --- a/docs/fasttext-notes.md +++ /dev/null @@ -1,152 +0,0 @@ -FastText Notes -============== - -The implementation is split across several submodules: - -- models.fasttext -- models.keyedvectors (includes FastText-specific code, not good) -- models.word2vec (superclasses) -- models.base_any2vec (superclasses) - -The implementation consists of several key classes: - -1. models.fasttext.FastTextVocab: the vocabulary -2. models.keyedvectors.FastTextKeyedVectors: the vectors -3. models.fasttext.FastTextTrainables: the underlying neural network -4. models.fasttext.FastText: ties everything together - -FastTextVocab -------------- - -Seems to be an entirely redundant class. -Inherits from models.word2vec.Word2VecVocab, adding no new functionality. - -FastTextKeyedVectors --------------------- - -Inheritance hierarchy: - -1. FastTextKeyedVectors -2. WordEmbeddingsKeyedVectors. Implements word similarity e.g. cosine similarity, WMD, etc. -3. BaseKeyedVectors (abstract base class) -4. utils.SaveLoad - -There are many attributes. - -Inherited from BaseKeyedVectors: - -- vectors: a 2D numpy array. Flexible number of rows (0 by default). Number of columns equals vector dimensionality. -- vocab: a dictionary. Keys are words. Items are Vocab instances: these are essentially namedtuples that contain an index and a count. The former is the index of a term in the entire vocab. The latter is the number of times the term occurs. 
-- vector_size (dimensionality) -- index2entity - -Inherited from WordEmbeddingsKeyedVectors: - -- vectors_norm -- index2word - -Added by FastTextKeyedVectors: - -- vectors_vocab: 2D array. Rows are vectors. Columns correspond to vector dimensions. Initialized in FastTextTrainables.init_ngrams_weights. Reset in reset_ngrams_weights. Referred to as syn0_vocab in fasttext_inner.pyx. These are vectors for every word in the vocabulary. -- vectors_vocab_norm: looks unused, see _clear_post_train method. -- vectors_ngrams: 2D array. Each row is a bucket. Columns correspond to vector dimensions. Initialized in init_ngrams_weights function. Initialized in _load_vectors method when reading from native FB binary. Modified in reset_ngrams_weights method. This is the first matrix loaded from the native binary files. -- vectors_ngrams_norm: looks unused, see _clear_post_train method. -- buckets_word: A hashmap. Keyed by the index of a term in the vocab. Each value is an array, where each element is an integer that corresponds to a bucket. Initialized in init_ngrams_weights function -- hash2index: A hashmap. Keys are hashes of ngrams. Values are the number of ngrams (?). Initialized in init_ngrams_weights function. -- min_n: minimum ngram length -- max_n: maximum ngram length -- num_ngram_vectors: initialized in the init_ngrams_weights function - -The init_ngrams_method looks like an internal method of FastTextTrainables. -It gets called as part of the prepare_weights method, which is effectively part of the FastModel constructor. - -The above attributes are initialized to None in the FastTextKeyedVectors class constructor. -Unfortunately, their real initialization happens in an entirely different module, models.fasttext - another indication of poor separation of concerns. - -Some questions: - -- What is the x_lockf stuff? Why is it used only by the fast C implementation? -- How are vectors_vocab and vectors_ngrams different? - -vectors_vocab contains vectors for entire vocabulary. -vectors_ngrams contains vectors for each _bucket_. - - -FastTextTrainables ------------------- - -[Link](https://radimrehurek.com/gensim/models/fasttext.html#gensim.models.fasttext.FastTextTrainables) - -This is a neural network that learns the vectors for the FastText embedding. -Mostly inherits from its [Word2Vec parent](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2VecTrainables). -Adds logic for calculating and maintaining ngram weights. - -Key attributes: - -- hashfxn: function for randomly initializing weights. Defaults to the built-in hash() -- layer1_size: The size of the inner layer of the NN. Equal to the vector dimensionality. Set in the Word2VecTrainables constructor. -- seed: The random generator seed used in reset_weights and update_weights -- syn1: The inner layer of the NN. Each row corresponds to a term in the vocabulary. Columns correspond to weights of the inner layer. There are layer1_size such weights. Set in the reset_weights and update_weights methods, only if hierarchical sampling is used. -- syn1neg: Similar to syn1, but only set if negative sampling is used. -- vectors_lockf: A one-dimensional array with one element for each term in the vocab. Set in reset_weights to an array of ones. -- vectors_vocab_lockf: Similar to vectors_vocab_lockf, ones(len(model.trainables.vectors), dtype=REAL) -- vectors_ngrams_lockf = ones((self.bucket, wv.vector_size), dtype=REAL) - -The lockf stuff looks like it gets used by the fast C implementation. 
- -The inheritance hierarchy here is: - -1. FastTextTrainables -2. Word2VecTrainables -3. utils.SaveLoad - -FastText --------- - -Inheritance hierarchy: - -1. FastText -2. BaseWordEmbeddingsModel: vocabulary management plus a ton of deprecated attrs -3. BaseAny2VecModel: logging and training functionality -4. utils.SaveLoad: for loading and saving - -Lots of attributes (many inherited from superclasses). - -From BaseAny2VecModel: - -- workers -- vector_size -- epochs -- callbacks -- batch_words -- kv -- vocabulary -- trainables - -From BaseWordEmbeddingModel: - -- alpha -- min_alpha -- min_alpha_yet_reached -- window -- random -- hs -- negative -- ns_exponent -- cbow_mean -- compute_loss -- running_training_loss -- corpus_count -- corpus_total_words -- neg_labels - -FastText attributes: - -- wv: FastTextWordVectors. Used instead of .kv - -Logging -------- - -The logging seems to be inheritance-based. -It may be better to refactor this using aggregation istead of inheritance in the future. -The benefits would be leaner classes with less responsibilities and better separation of concerns. From 9b5e1611f1569ea5984678dbd8aeb940f63511be Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 27 Jan 2019 19:24:12 +1100 Subject: [PATCH 24/39] review response: include _fasttext_bin in docs --- docs/src/apiref.rst | 1 + docs/src/models/_fasttext_bin.rst | 10 ++++++++++ gensim/models/_fasttext_bin.py | 5 +++-- 3 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 docs/src/models/_fasttext_bin.rst diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index c4f31f7f28..ae345e22bd 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -47,6 +47,7 @@ Modules: models/keyedvectors models/doc2vec models/fasttext + models/_fasttext_bin models/phrases models/poincare models/coherencemodel diff --git a/docs/src/models/_fasttext_bin.rst b/docs/src/models/_fasttext_bin.rst new file mode 100644 index 0000000000..eb9a0ad950 --- /dev/null +++ b/docs/src/models/_fasttext_bin.rst @@ -0,0 +1,10 @@ +:mod:`models._fasttext_bin` -- Facebook I/O +=========================================== + +.. automodule:: gensim.models._fasttext_bin + :synopsis: Facebook I/O + :members: + :inherited-members: + :special-members: __getitem__ + :undoc-members: + :show-inheritance: diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py index 713aafebfc..2dae20d101 100644 --- a/gensim/models/_fasttext_bin.py +++ b/gensim/models/_fasttext_bin.py @@ -8,6 +8,7 @@ -------- Load a model from a binary file: + .. sourcecode:: pycon >>> from gensim.test.utils import datapath @@ -84,8 +85,8 @@ def _yield_field_names(): Model = collections.namedtuple('Model', _FIELD_NAMES) """Holds data loaded from the Facebook binary. -Fields ------- +Parameters +---------- dim : int The dimensionality of the vectors. ws : int From 6aa013af3485574f02111d7a1d75a770f70d1b72 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 27 Jan 2019 19:28:00 +1100 Subject: [PATCH 25/39] review response: make examples more readable --- gensim/models/fasttext.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index ad850607ad..1dc54be39e 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -32,6 +32,7 @@ >>> # from gensim.models import FastText # FIXME: why does Sphinx dislike this import? 
>>> from gensim.test.utils import common_texts # some example sentences + >>> >>> print(common_texts[0]) ['human', 'interface', 'computer'] >>> print(len(common_texts)) @@ -49,10 +50,7 @@ .. sourcecode:: pycon - >>> model2 = FastText( - ... size=4, window=3, min_count=1, - ... sentences=common_texts, iter=10 - ... ) + >>> model2 = FastText(size=4, window=3, min_count=1, sentences=common_texts, iter=10) .. Important:: We intend to deprecate this second method of passing everything through the constructor. @@ -65,6 +63,7 @@ .. sourcecode:: pycon >>> import numpy as np + >>> >>> np.allclose(model.wv['computer'], model2.wv['computer']) True @@ -79,9 +78,11 @@ .. sourcecode:: pycon >>> from gensim.test.utils import datapath + >>> >>> corpus_file = datapath('lee_background.cor') # absolute path to corpus >>> model3 = FastText(size=4, window=3, min_count=1) >>> model3.build_vocab(corpus_file=corpus_file) # scan over corpus to build the vocabulary + >>> >>> total_examples = model3.corpus_count # number of sentences in the corpus >>> total_words = model3.corpus_total_words # number of words in the corpus >>> model3.train(corpus_file=corpus_file, total_examples=total_examples, total_words=total_words, epochs=5) @@ -129,6 +130,7 @@ .. sourcecode:: pycon >>> import numpy as np + >>> >>> 'computation' in model.wv.vocab # New word, currently out of vocab False >>> old_vector = np.copy(model.wv['computation']) # Grab the existing vector @@ -140,8 +142,10 @@ ... ['central', 'processing', 'unit'], ... ['onboard', 'car', 'computer'], ... ] + >>> >>> model.build_vocab(new_sentences, update=True) # Update the vocabulary >>> model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs) + >>> >>> new_vector = model.wv['computation'] >>> np.allclose(old_vector, new_vector, atol=1e-4) # Vector has changed, model has learnt something False From 7d2b562c72c0a54fb5b37c4b22decb8b9a4d0f29 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 27 Jan 2019 19:31:09 +1100 Subject: [PATCH 26/39] review response: remove blank line --- gensim/models/fasttext.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 1dc54be39e..183bfc76e6 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -1059,7 +1059,6 @@ class FastTextTrainables(Word2VecTrainables): Attributes ---------- - hashfxn : function Used for randomly initializing weights. Defaults to the built-in hash() layer1_size : int From 25b24c7fe391c9a588e59fe2f19f6558bc424d6b Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 27 Jan 2019 19:32:05 +1100 Subject: [PATCH 27/39] review response: add emphasis --- gensim/models/fasttext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 183bfc76e6..f4928026e6 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -952,7 +952,7 @@ def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True): Specifies the file encoding. full_model : boolean, optional If False, skips loading the hidden output matrix. This saves a fair bit - of CPU time and RAM, but prevents training continuation. + of CPU time and RAM, but **prevents training continuation**. 
Returns ------- From b4e8405490a054135f0da9c6db19fc199d719fb4 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 27 Jan 2019 19:42:54 +1100 Subject: [PATCH 28/39] review response: add comment --- gensim/models/fasttext.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index f4928026e6..9e4b0a7922 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -155,7 +155,8 @@ .. Important:: Be sure to call the :meth:`~gensim.models.fasttext.FastText.build_vocab` method before the :meth:`~gensim.models.fasttext.FastText.train` method - when continuing training. + when continuing training. Without this call, previously unseen terms + will notbe added to the vocabulary. You can also load models trained with Facebook's fastText implementation: From 1fc9bf28d113c09ce6ff5d0459e27b1e6ba28edf Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 27 Jan 2019 19:43:04 +1100 Subject: [PATCH 29/39] review response: add example --- gensim/models/fasttext.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 9e4b0a7922..272df3a5cf 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -955,6 +955,44 @@ def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True): If False, skips loading the hidden output matrix. This saves a fair bit of CPU time and RAM, but **prevents training continuation**. + Examples + -------- + + Load, infer, continue training: + + .. sourcecode:: pycon + + >>> from gensim.test.utils import datapath + >>> + >>> cap_path = datapath("crime-and-punishment.bin") + >>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True) + >>> + >>> 'landlord' in fb_full.wv.vocab # Word is out of vocabulary + False + >>> oov_term = fb_full.wv['landlord'] + >>> + >>> 'landlady' in fb_full.wv.vocab # Word is in the vocabulary + True + >>> iv_term = fb_full.wv['landlady'] + >>> + >>> new_sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'flies']] + >>> fb_full.build_vocab(new_sent, update=True) + >>> fb_full.train(sentences=new_sent, total_examples=len(new_sent), epochs=5) + + Load quickly, infer (forego training continuation): + + .. sourcecode:: pycon + + >>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False) + >>> + >>> 'landlord' in fb_partial.wv.vocab # Word is out of vocabulary + False + >>> oov_term = fb_partial.wv['landlord'] + >>> + >>> 'landlady' in fb_partial.wv.vocab # Word is in the vocabulary + True + >>> iv_term = fb_partial.wv['landlady'] + Returns ------- gensim.models.fasttext.FastText From 72ec312598f89d2450af774220f334a7f5aab56f Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 27 Jan 2019 23:11:21 +1100 Subject: [PATCH 30/39] review response: remove redundant line --- gensim/models/fasttext.py | 2 -- gensim/models/keyedvectors.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 272df3a5cf..69f81c2e15 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -411,8 +411,6 @@ class FastText(BaseWordEmbeddingsModel): :meth:`~gensim.models.fasttext.FastText.load` methods, or loaded from a format compatible with the original Fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format`. 
- Some important internal attributes are the following: - Attributes ---------- wv : :class:`~gensim.models.keyedvectors.FastTextKeyedVectors` diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 17d1b18167..09c9f10e17 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -1943,8 +1943,6 @@ class FastTextKeyedVectors(WordEmbeddingsKeyedVectors): If True, uses the Facebook-compatible hash function instead of the Gensim backwards-compatible hash function. - Some important attributes: - Attributes ---------- vectors_vocab : np.array From 29c4fafad8e1368b0e920a773bab3d255ee1a51a Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Mon, 28 Jan 2019 19:08:48 +1100 Subject: [PATCH 31/39] review response: update comment --- gensim/models/fasttext.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 69f81c2e15..61d7a57441 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -154,9 +154,9 @@ .. Important:: Be sure to call the :meth:`~gensim.models.fasttext.FastText.build_vocab` - method before the :meth:`~gensim.models.fasttext.FastText.train` method + method with `update=True` before the :meth:`~gensim.models.fasttext.FastText.train` method when continuing training. Without this call, previously unseen terms - will notbe added to the vocabulary. + will not be added to the vocabulary. You can also load models trained with Facebook's fastText implementation: From 74410fc93359819d7e1f7cfddbc1b96beeaf595d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Mon, 28 Jan 2019 23:13:51 +1100 Subject: [PATCH 32/39] Update gensim/models/fasttext.py Co-Authored-By: mpenkov --- gensim/models/fasttext.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 61d7a57441..25a543abfd 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -53,7 +53,11 @@ >>> model2 = FastText(size=4, window=3, min_count=1, sentences=common_texts, iter=10) .. Important:: - We intend to deprecate this second method of passing everything through the constructor. + This style of initialize-and-train in a single line is **deprecated**. We include it here + for backward compatibility only. + + Please use the initialize-`build_vocab`-`train` pattern above instead, including using `epochs` + instead of `iter`. The motivation is to simplify the API and resolve naming inconsistencies, e.g. the iter parameter to the constructor is called epochs in the train function. 
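
To make the naming inconsistency above concrete, here is the deprecated one-liner next
to the recommended pattern. This is a minimal sketch reusing the `common_texts` toy
corpus from the examples above; the `model_a`/`model_b` names are arbitrary. Note
`iter` in the constructor versus `epochs` in :meth:`~gensim.models.fasttext.FastText.train`:

.. sourcecode:: pycon

    >>> from gensim.models import FastText
    >>> from gensim.test.utils import common_texts
    >>>
    >>> # Deprecated: initialize and train in a single call, epochs passed as `iter`
    >>> model_a = FastText(size=4, window=3, min_count=1, sentences=common_texts, iter=10)
    >>>
    >>> # Recommended: explicit steps, epochs passed as `epochs`
    >>> model_b = FastText(size=4, window=3, min_count=1)
    >>> model_b.build_vocab(sentences=common_texts)
    >>> model_b.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)
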
From a3456a4cfd575b9fbfb900630d640a78a25cac72 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Mon, 28 Jan 2019 23:30:52 +1100 Subject: [PATCH 33/39] review response: improve examples --- gensim/models/fasttext.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 25a543abfd..b2d2f01054 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -87,15 +87,14 @@ >>> model3 = FastText(size=4, window=3, min_count=1) >>> model3.build_vocab(corpus_file=corpus_file) # scan over corpus to build the vocabulary >>> - >>> total_examples = model3.corpus_count # number of sentences in the corpus >>> total_words = model3.corpus_total_words # number of words in the corpus - >>> model3.train(corpus_file=corpus_file, total_examples=total_examples, total_words=total_words, epochs=5) + >>> model3.train(corpus_file=corpus_file, total_words=total_words, epochs=5) -The model needs the `total_examples` and `total_words` parameters in order to +The model needs the `total_words` parameter in order to manage the training rate (alpha) correctly, and to give accurate progress estimates. The above example relies on an implementation detail: the :meth:`~gensim.models.fasttext.FastText.build_vocab` method -sets the `corpus_count` and `corpus_total_words` model attributes. +sets the `corpus_total_words` (and also `corpus_count`) model attributes. You may calculate them by scanning over the corpus yourself, too. If you have a corpus in a different format, then you can use it by wrapping it @@ -105,7 +104,7 @@ .. sourcecode:: pycon - >>> class MyIter: + >>> class MyIter(object): ... def __iter__(self): ... with open(datapath('crime-and-punishment.txt')) as fin: ... for line in fin: From 96eab08fbeeec8f7be8ec9583e9de4ac1f7fea4a Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Mon, 28 Jan 2019 23:32:46 +1100 Subject: [PATCH 34/39] clarify example --- gensim/models/fasttext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index b2d2f01054..9dc722d35d 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -99,7 +99,7 @@ If you have a corpus in a different format, then you can use it by wrapping it in an `iterator `_. -Your iterator should yield a list of strings each time. +Your iterator should yield a list of strings each time, where each string should be a separate word. Gensim will take care of the rest: .. sourcecode:: pycon From ff721851a05fa4dd968664f4d19910d7a89b4cbb Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Tue, 29 Jan 2019 10:16:43 +1100 Subject: [PATCH 35/39] review response: improve example --- gensim/models/fasttext.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 9dc722d35d..c19b59a6ec 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -104,11 +104,15 @@ .. sourcecode:: pycon + >>> import smart_open + >>> + >>> >>> class MyIter(object): ... def __iter__(self): - ... with open(datapath('crime-and-punishment.txt')) as fin: + ... path = datapath('crime-and-punishment.txt') + ... with smart_open.smart_open(path, 'r', encoding='utf-8') as fin: ... for line in fin: - ... yield line.lower().strip().split(" ") + ... 
yield line.lower().strip().split() >>> >>> >>> model4 = FastText(size=4, window=3, min_count=1) @@ -935,6 +939,10 @@ def __contains__(self, word): def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True): """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files. + By default, this function loads the full model. + A full model allows continuing training with more data, but also consumes more RAM and takes longer to load. + If you do not need to continue training and only wish the work with the already-trained embeddings, use `partial=False` for faster loading and to save RAM. + Notes ------ Facebook provides both `.vec` and `.bin` files with their modules. From 9140cf6f27dcd04c5f6acb10f8f1d3de828eb291 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Tue, 29 Jan 2019 10:21:55 +1100 Subject: [PATCH 36/39] review response: improve tokenization in example --- gensim/models/fasttext.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index c19b59a6ec..f5309d68f7 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -104,6 +104,7 @@ .. sourcecode:: pycon + >>> from gensim.utils import tokenize >>> import smart_open >>> >>> @@ -112,7 +113,7 @@ ... path = datapath('crime-and-punishment.txt') ... with smart_open.smart_open(path, 'r', encoding='utf-8') as fin: ... for line in fin: - ... yield line.lower().strip().split() + ... yield list(tokenize(line)) >>> >>> >>> model4 = FastText(size=4, window=3, min_count=1) From 31c79c3414b0b7e0f83037ff0616637885256f45 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Tue, 29 Jan 2019 11:50:49 +1100 Subject: [PATCH 37/39] flake8 --- gensim/models/fasttext.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index f5309d68f7..048484d486 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -54,8 +54,8 @@ .. Important:: This style of initialize-and-train in a single line is **deprecated**. We include it here - for backward compatibility only. - + for backward compatibility only. + Please use the initialize-`build_vocab`-`train` pattern above instead, including using `epochs` instead of `iter`. The motivation is to simplify the API and resolve naming inconsistencies, From 2f479caf79ee90b2c62da06ebb979df3b5465068 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Tue, 29 Jan 2019 14:23:31 +1100 Subject: [PATCH 38/39] fix long lines --- gensim/models/fasttext.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 048484d486..f34852dda9 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -940,9 +940,11 @@ def __contains__(self, word): def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True): """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files. - By default, this function loads the full model. - A full model allows continuing training with more data, but also consumes more RAM and takes longer to load. - If you do not need to continue training and only wish the work with the already-trained embeddings, use `partial=False` for faster loading and to save RAM. + By default, this function loads the full model. A full model allows + continuing training with more data, but also consumes more RAM and + takes longer to load. 
If you do not need to continue training and only
+        wish to work with the already-trained embeddings, use `partial=False`
+        for faster loading and to save RAM.
 
         Notes

From c48a7f3300e299ef140749f65d965a3b2eff210b Mon Sep 17 00:00:00 2001
From: Michael Penkov
Date: Tue, 29 Jan 2019 14:55:31 +1100
Subject: [PATCH 39/39] fixup: use correct parameter name

---
 gensim/models/fasttext.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index f34852dda9..712ad93a0c 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -943,7 +943,7 @@ def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True):
         By default, this function loads the full model. A full model allows
         continuing training with more data, but also consumes more RAM and
         takes longer to load. If you do not need to continue training and only
-        wish to work with the already-trained embeddings, use `partial=False`
+        wish to work with the already-trained embeddings, use `full_model=False`
         for faster loading and to save RAM.
 
         Notes
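
One loose end from the streaming examples above: the module documentation notes that you
may calculate `total_examples` and `total_words` by scanning over the corpus yourself,
instead of relying on :meth:`~gensim.models.fasttext.FastText.build_vocab` to set the
`corpus_count` and `corpus_total_words` attributes. A minimal sketch of that, reusing
the `MyIter` iterator and the `model4` instance from the streaming example, at the cost
of one extra pass over the corpus:

.. sourcecode:: pycon

    >>> total_examples = 0  # number of sentences in the corpus
    >>> total_words = 0  # number of words in the corpus
    >>> for sentence in MyIter():
    ...     total_examples += 1
    ...     total_words += len(sentence)
    >>>
    >>> model4.train(sentences=MyIter(), total_examples=total_examples, total_words=total_words, epochs=5)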