Improve FastText documentation #2353

Merged: 40 commits, Jan 29, 2019
Commits
18f5302  WIP: doco improvements (mpenkov, Jan 24, 2019)
b57a086  more doco (mpenkov, Jan 25, 2019)
5fa0c9f  Merge remote-tracking branch 'upstream/develop' into doc-improv (mpenkov, Jan 25, 2019)
d66f55c  flake8-docs updates (mpenkov, Jan 25, 2019)
9696657  adding fixmes (mpenkov, Jan 25, 2019)
3019bea  minor fixup (mpenkov, Jan 26, 2019)
74b740c  review response (mpenkov, Jan 26, 2019)
09ab630  Remove magic constant (mpenkov, Jan 26, 2019)
2e728cb  deprecate the iter parameter to the FastText constructor (mpenkov, Jan 26, 2019)
c435a8e  minor documentation fixes (mpenkov, Jan 26, 2019)
c688877  review response: use absolute references (mpenkov, Jan 26, 2019)
677679c  review response (mpenkov, Jan 26, 2019)
29c5210  fix unit test (mpenkov, Jan 26, 2019)
044d699  Revert "deprecate the iter parameter to the FastText constructor" (mpenkov, Jan 26, 2019)
f9df136  Revert "fix unit test" (mpenkov, Jan 26, 2019)
cdc727a  more documentation improvements (mpenkov, Jan 27, 2019)
4ea3f06  comment out pesky import (mpenkov, Jan 27, 2019)
e532d62  fix typo (mpenkov, Jan 27, 2019)
931d3d7  improve tutorial notebook (mpenkov, Jan 27, 2019)
177c712  minor documentation update (mpenkov, Jan 27, 2019)
bd83886  flake8-docs (mpenkov, Jan 27, 2019)
11fabca  more doco fixes (mpenkov, Jan 27, 2019)
2d490f0  fix example (mpenkov, Jan 27, 2019)
9b6f8bb  git rm docs/fasttext-notes.md (mpenkov, Jan 27, 2019)
9b5e161  review response: include _fasttext_bin in docs (mpenkov, Jan 27, 2019)
6aa013a  review response: make examples more readable (mpenkov, Jan 27, 2019)
7d2b562  review response: remove blank line (mpenkov, Jan 27, 2019)
25b24c7  review response: add emphasis (mpenkov, Jan 27, 2019)
b4e8405  review response: add comment (mpenkov, Jan 27, 2019)
1fc9bf2  review response: add example (mpenkov, Jan 27, 2019)
72ec312  review response: remove redundant line (mpenkov, Jan 27, 2019)
29c4faf  review response: update comment (mpenkov, Jan 28, 2019)
74410fc  Update gensim/models/fasttext.py (piskvorky, Jan 28, 2019)
a3456a4  review response: improve examples (mpenkov, Jan 28, 2019)
96eab08  clarify example (mpenkov, Jan 28, 2019)
ff72185  review response: improve example (mpenkov, Jan 28, 2019)
9140cf6  review response: improve tokenization in example (mpenkov, Jan 28, 2019)
31c79c3  flake8 (mpenkov, Jan 29, 2019)
2f479ca  fix long lines (mpenkov, Jan 29, 2019)
c48a7f3  fixup: use correct parameter name (mpenkov, Jan 29, 2019)

3 changes: 3 additions & 0 deletions gensim/models/_fasttext_bin.py
@@ -1,6 +1,9 @@
# -*- coding: utf-8 -*-
"""Load models from the native binary format released by Facebook.

The main entry point is the :py:func:`load` function.
It returns a :py:class:`Model` namedtuple containing everything loaded from the binary.

Examples
--------

120 changes: 113 additions & 7 deletions gensim/models/fasttext.py
@@ -14,6 +14,9 @@
This module contains a fast native C implementation of Fasttext with Python interfaces. It is **not** only a wrapper
around Facebook's implementation.

This module supports loading models trained with Facebook's fastText implementation.
It also supports continuing training from such models.

For a tutorial see `this notebook
<https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/FastText_Tutorial.ipynb>`_.

@@ -31,6 +34,15 @@
>>> from gensim.models import FastText
>>>
>>> model = FastText(common_texts, size=4, window=3, min_count=1, iter=10)
>>> sentences = [
... ['computer', 'artificial', 'intelligence'],
... ['artificial', 'trees'],
... ['human', 'intelligence'],
... ['artificial', 'graph'],
... ['intelligence'],
... ['artificial', 'intelligence', 'system']
... ]
>>> model.train(sentences, total_examples=len(sentences), epochs=model.epochs)
piskvorky (Owner):

Where is this model.epochs coming from? The model instantiation above shows no such variable.

mpenkov (Collaborator Author), Jan 25, 2019:

Yes, the epochs parameter is optional, so it is not included during instantiation.

Unfortunately, there is also some confusion about its name: the FastText constructor uses iter to specify the number of epochs, whereas the superclass uses the proper name epochs.

The presence of the epochs parameter to the train function (which seems to override the one set in the constructor) also complicates matters.
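
To make the naming mismatch concrete, here is a minimal sketch (assuming gensim 3.x behaviour, where the constructor's iter value ends up stored on the model as epochs):

from gensim.test.utils import common_texts
from gensim.models import FastText

# `iter` in the constructor becomes the `epochs` attribute on the model instance
model = FastText(common_texts, size=4, window=3, min_count=1, iter=10)
print(model.epochs)  # 10

# train() takes its own `epochs` argument, and that is the value it actually uses
model.train(common_texts, total_examples=len(common_texts), epochs=5)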

piskvorky (Owner), Jan 26, 2019:

Hm. If it's optional, let's not use it in train. Or if we use it in train, let's instantiate it explicitly. This neither-here-nor-there example is confusing ("where does this value come from?").

Regarding iter / epochs -- can you please rename it to epochs, consistently? I remember some discussion around this (cc @menshikh-iv @gojomo ), but can't imagine why we'd want both. At most we could support iter for a while as an alias, but with a clear deprecation warning.

This is a perfect opportunity to clean up some of the API mess, rather than piling on.

mpenkov (Collaborator Author):

I agree regarding the cleanup. My preference would be to leave epochs/iter out of the constructor. The model doesn't need that parameter until training time.

piskvorky (Owner), Jan 26, 2019:

Models in Gensim generally allow the trained_model = Constructor(params_including_training_params) pattern. So breaking that could be confusing to existing users (and a big backward-incompatible change).

I'm not totally opposed though, especially if we still allow constructor params for a while with "deprecated" warnings. The API needs a clean-up, and now is a good time.

Not a big priority though, and the documentation examples can already promote instantiate-then-train as a two-step pattern.

mpenkov (Collaborator Author):

It's not just training parameters you need to include in the constructor. It's also parameters for vocabulary creation. So you're managing at least 3 sets of separate parameters, 2 of which are duplicated by other methods of the class.

piskvorky (Owner), Jan 27, 2019:

Yes, we should promote them as separate steps in docs. The question is: do we deprecate (certainly not remove) them from the constructor?

mpenkov (Collaborator Author), Jan 27, 2019:

I understand your motivation in not removing them (backward compatibility). Unfortunately, the current mess won't go away until we remove things like this.

I think the first step should be to deprecate them. After a while, we can remove them, perhaps in time for a major release.

If we want a one-liner way to instantiate and train, we can always write a pure function and promote that. That should make it easier for users to cut over to the cleaner API.
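
For illustration, such a helper is hypothetical (it is not part of the gensim API today), but it could be as small as:

def train_fasttext(sentences, epochs=5, **params):
    """Hypothetical convenience wrapper: instantiate, build the vocab and train in one call."""
    model = FastText(**params)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=epochs)
    return model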

piskvorky (Owner):

Yes, deprecation is what I suggest.


Persist a model to disk with:

@@ -41,7 +53,49 @@
>>> fname = get_tmpfile("fasttext.model")
>>>
>>> model.save(fname)
>>> model = FastText.load(fname) # you can continue training with the loaded model!
>>> model = FastText.load(fname)

Once loaded, such models behave identically to those created from scratch.
For example, you can continue training the loaded model:

>>> new_sentences = [
... ['sweet', 'child', 'of', 'mine'],
... ['rocket', 'queen'],
... ['you', 'could', 'be', 'mine'],
... ['november', 'rain'],
... ]
>>> 'rocket' in model.wv
False
>>> model.train(new_sentences, total_examples=len(sentences), epochs=model.epochs)
piskvorky (Owner), Jan 25, 2019:

Why total_examples=len(sentences)? Even if correct, it looks strange… it would be my first question as a user.
Let's provide the answer here in the docs: why is total_examples needed, and what is its role?

mpenkov (Collaborator Author):

It's correct. I agree it is confusing. The docstring for the train function attempts to clarify the situation.

Personally, I think that if neither total_examples nor total_words is specified, we should try to determine sensible defaults by looking at e.g. len(sentences). WDYT @menshikh-iv ?

piskvorky (Owner), Jan 26, 2019:

Are you sure? I read the linked docs and still don't get why it's not len(new_sentences).

Please include some top-level intuition here. A short sentence on why this parameter is mandatory, and what should be its value, because it looks really strange and superfluous. +1 for sensible defaults.

gojomo (Collaborator):

For train() to manage alpha correctly, and show meaningful progress estimates, it needs a good estimate of the size of the supplied-corpus - even when the corpus (as an iterable) may not self-report its length. In the typical case where the same corpus was just surveyed for its vocabulary, this value should be handy. (In the Word2Vec case, the count from the vocab-scan is cached inside the model for later consultation – unsure if the FT paths do this.) In the case where other/new data is being supplied, the caller should supply the right counts for the current data.
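
A short sketch of that intuition, assuming the gensim 3.x API (where build_vocab caches the scan count as model.corpus_count):

model = FastText(size=4, window=3, min_count=1)
model.build_vocab(sentences)

# same corpus that was just scanned for the vocabulary: reuse the cached count
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

# different/new data: the caller supplies the count for *that* data
model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)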

mpenkov (Collaborator Author):

@gojomo If the corpus does self-report its length though, should we use that instead? If yes, which should we do:

  1. total_examples = len(corpus)
  2. total_words = len(corpus)

If the corpus does not self-report its length, then we could raise an exception with a helpful message. WDYT?

gojomo (Collaborator):

If it's able to self-report its length, in count of texts, then yes, that would work as total_examples. But because streaming a corpus from an iterable is the motivating case for this interface, relying on that inside the method seems inappropriate to me. Leaving it the responsibility of the caller seems fine to me – and if they're lucky enough to have a corpus that reports its own length, they can supply it easily.

mpenkov (Collaborator Author), Jan 26, 2019:

Also, while we're talking about simplifying the API, what do you think about removing the sentences and corpus_file parameters from the constructor? Currently, we have an inconsistency: in the constructor, we just pass sentences/corpus_file without total_examples and total_words parameters. In the train function, we include those additional parameters.

Instead of passing sentences in the constructor, the user can pass them in separately via the train function.

Pros:

  • Simpler constructor. There will be fewer parameters.
  • Easier to understand and consistent API.

Cons:

  • Getting a trained model now takes two steps instead of one (instantiate, then train). Not sure how much of a con this is, given that the train function is more powerful than the constructor anyway: it has parameters that the constructor doesn't (see the sketch below).

@menshikh-iv @piskvorky @gojomo What do you think?

The same thing also applies to the callbacks parameters.
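
To illustrate the trade-off, a sketch of both patterns under the current (gensim 3.x) API:

# one step: the corpus and the training parameters all go through the constructor
model = FastText(sentences, size=4, window=3, min_count=1, iter=10)

# two steps (what the proposal would promote): the constructor holds only model parameters
model = FastText(size=4, window=3, min_count=1)
model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=10)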

mpenkov (Collaborator Author), Jan 26, 2019:

> But because streaming a corpus from an iterable is the motivating case for this interface, relying on that inside the method seems inappropriate to me.

@gojomo Why do you think it is inappropriate? We could do something like this:

if total_examples or total_words:
    pass  # nothing to do here
elif sentences and hasattr(sentences, '__len__'):  # could also check for callable, if necessary
    total_examples = len(sentences)
elif data_corpus and hasattr(data_corpus, '__len__'):
    total_examples = len(data_corpus)
else:
    raise ValueError(
        'unable to infer total_examples or total_words from the training source, '
        'please pass one of them explicitly'
    )

It looks ugly, but it allows the user to do something like:

model.train(sentences)

instead of

model.train(sentences, total_examples=len(sentences))

I feel the former is more Pythonic.

Finally, I think having two separate keyword parameters for the input is confusing for the user. In my opinion, it would look a lot simpler if we unified the two parameters, and dealt with untangling them in the implementation.
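
A sketch of the kind of untangling that could live inside the implementation (a hypothetical helper, not existing gensim code):

def _untangle_input(data):
    """Hypothetical: accept a single input argument and dispatch on its type."""
    if isinstance(data, str):
        return {'corpus_file': data}  # path to a file on disk
    return {'sentences': data}  # iterable of tokenized texts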

piskvorky (Owner), Jan 26, 2019:

@mpenkov yes, I'd consider that "sensible" defaults. Thanks.

Agreed on unifying iter/epoch. How would that work? Keep both as acceptable input (internally unify to one), no deprecations? Deprecate one? Which one?

mpenkov (Collaborator Author), Jan 27, 2019:

@piskvorky OK. I think it's worth dealing with API refactoring in a separate PR, for two reasons:

  1. Such changes risk introducing regressions, and I'd rather not have them together with the relatively safe documentation changes.
  2. We've already spent much effort refactoring FastText, and it may be prudent to wait for things to stabilize (e.g. fix introduced regressions) before proceeding.

To answer your question, I think it makes sense to deprecate iter from the constructor (a possible deprecation shim is sketched below). It's a poor name for a parameter, for three reasons:

  1. It masks the built-in iter function.
  2. It isn't obvious.
  3. It's inconsistent. We use epochs everywhere else outside of the constructor.
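
One possible shape for the deprecation shim discussed above (illustrative only, not the actual gensim implementation):

import warnings

def _resolve_epochs(epochs=5, iter=None):
    """Accept the deprecated `iter` alias with a warning and return the value to use."""
    if iter is not None:  # note how the parameter name shadows the built-in iter()
        warnings.warn("the `iter` parameter is deprecated, use `epochs` instead", DeprecationWarning)
        return iter
    return epochs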

>>> 'rocket' in model.wv
True

You can also load models trained with Facebook's fastText implementation:

.. sourcecode:: pycon

>>> from gensim.test.utils import datapath
>>> cap_path = datapath("crime-and-punishment.bin")
>>> # Partial model: loads quickly, uses less RAM, but cannot continue training
>>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False)
>>> # Full model: loads slowly, consumes RAM, but can continue training (see below)
>>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True)

Once loaded, such models behave identically to those trained from scratch.
You may continue training them on new data:

.. sourcecode:: pycon

>>> 'computer' in fb_full.wv.vocab # New word, currently out of vocab
False
>>> 'rocket' in fb_full.wv.vocab
False
>>> fb_full.train(sentences, total_examples=len(sentences), epochs=model.epochs)
>>> fb_full.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)
>>> 'computer' in fb_full.wv.vocab # We have learned this word now
True
>>> 'rocket' in fb_full.wv.vocab
True

Retrieve word-vector for vocab and out-of-vocab word:

@@ -85,6 +139,28 @@

>>> analogies_result = model.wv.evaluate_word_analogies(datapath('questions-words.txt'))

Implementation Notes
--------------------

These notes may help developers navigate our fastText implementation.
Our FastText implementation is split across several submodules:

- :py:mod:`gensim.models.fasttext`: This module. Contains FastText-specific functionality only.
- :py:mod:`gensim.models.keyedvectors`: Implements both generic and FastText-specific functionality.
- :py:mod:`gensim.models.word2vec`:
- :py:mod:`gensim.models.base_any2vec`:
- :py:mod:`gensim.models.utils_any2vec`: Wrapper over Cython extensions.

Our implementation relies heavily on inheritance.
It consists of several important classes:

- :py:class:`FastTextVocab`: the vocabulary. Redundant, simply wraps its superclass.
- :py:class:`~gensim.models.keyedvectors.FastTextKeyedVectors`: the vectors.
Once training is complete, this class is sufficient for calculating embeddings.
- :py:class:`FastTextTrainables`: the underlying neural network. The implementation
uses this class to *learn* the word embeddings.
- :py:class:`FastText`: ties everything together.

"""

import logging
@@ -759,7 +835,8 @@ def load_fasttext_format(cls, model_file, encoding='utf8'):

Notes
------
Due to limitations in the FastText API, you cannot continue training with a model loaded this way.
This function effectively ignores the `.vec` output file.
It only needs the `.bin` file.

Parameters
----------
Expand All @@ -773,7 +850,7 @@ def load_fasttext_format(cls, model_file, encoding='utf8'):

Returns
-------
:class: `~gensim.models.fasttext.FastText`
gensim.models.fasttext.FastText
The loaded model.

"""
@@ -862,15 +939,44 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_inse
return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive)


#
# Keep for backward compatibility.
#
class FastTextVocab(Word2VecVocab):
"""This is a redundant class. It exists only to maintain backwards compatibility
with older gensim versions."""
pass


class FastTextTrainables(Word2VecTrainables):
"""Represents the inner shallow neural network used to train :class:`~gensim.models.fasttext.FastText`."""
"""Represents the inner shallow neural network used to train :class:`~gensim.models.fasttext.FastText`.

Mostly inherits from its parent (:py:class:`gensim.models.word2vec.Word2VecTrainables`).
Adds logic for calculating and maintaining ngram weights.

Attributes
----------

hashfxn : function
Used for randomly initializing weights. Defaults to the built-in hash()
layer1_size : int
The size of the inner layer of the NN. Equal to the vector dimensionality. Set in the :py:class:`gensim.models.word2vec.Word2VecTrainables` constructor.
seed : float
The random generator seed used in reset_weights and update_weights
syn1 : numpy.array
The inner layer of the NN. Each row corresponds to a term in the vocabulary. Columns correspond to weights of the inner layer. There are layer1_size such weights. Set in the reset_weights and update_weights methods, only if hierarchical sampling is used.
syn1neg : numpy.array
Similar to syn1, but only set if negative sampling is used.
vectors_lockf : numpy.array
A one-dimensional array with one element for each term in the vocab. Set in reset_weights to an array of ones.
vectors_vocab_lockf : numpy.array
Similar to vectors_vocab_lockf, ones(len(model.trainables.vectors), dtype=REAL)
vectors_ngrams_lockf : numpy.array
np.ones((self.bucket, wv.vector_size), dtype=REAL)

Notes
-----

The lockf stuff looks like it gets used by the fast C implementation.

"""
def __init__(self, vector_size=100, seed=1, hashfxn=hash, bucket=2000000):
super(FastTextTrainables, self).__init__(
vector_size=vector_size, seed=seed, hashfxn=hashfxn)
10 changes: 8 additions & 2 deletions gensim/models/keyedvectors.py
@@ -1936,14 +1936,19 @@ class FastTextKeyedVectors(WordEmbeddingsKeyedVectors):
If True, uses the Facebook-compatible hash function instead of the
Gensim backwards-compatible hash function.

Some important attributes:

Attributes
----------
vectors_vocab : np.array
A vector for each entity in the vocabulary.
Each row corresponds to a vector for an entity in the vocabulary.
Columns correspond to vector dimensions.
vectors_vocab_norm : np.array
Same as vectors_vocab, but the vectors are L2 normalized.
vectors_ngrams : np.array
A vector for each ngram across all entities in the vocabulary.
Each row is a vector that corresponds to a bucket.
Columns correspond to vector dimensions.
vectors_ngrams_norm : np.array
Same as vectors_ngrams, but the vectors are L2 normalized.
Under some conditions, may actually be the same matrix as
@@ -1957,7 +1962,8 @@ class FastTextKeyedVectors(WordEmbeddingsKeyedVectors):
bucket to an index, and then indexing into vectors_ngrams (in other
words, vectors_ngrams[hash2index[hash_fn(ngram) % bucket]].
num_ngram_vectors : int
TODO
The number of vectors that correspond to ngrams, as opposed to terms
(full words).

"""
def __init__(self, vector_size, min_n, max_n, bucket, compatible_hash):