diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst
index cbe06b6fb5..ca3c1ec019 100644
--- a/docs/src/auto_examples/index.rst
+++ b/docs/src/auto_examples/index.rst
@@ -13,7 +13,7 @@ If you're thinking about contributing documentation, please see :ref:`sphx_glr_a
.. raw:: html
-
+
@@ -33,9 +33,10 @@ Understanding this functionality is vital for using gensim effectively.
.. only:: html
- .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png
+ .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png
+ :alt: Core Concepts
- :ref:`sphx_glr_auto_examples_core_run_core_concepts.py`
+ :ref:`sphx_glr_auto_examples_core_run_core_concepts.py`
.. raw:: html
@@ -53,9 +54,10 @@ Understanding this functionality is vital for using gensim effectively.
.. only:: html
- .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png
+ .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png
+ :alt: Corpora and Vector Spaces
- :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py`
+ :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py`
.. raw:: html
@@ -73,9 +75,10 @@ Understanding this functionality is vital for using gensim effectively.
.. only:: html
- .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png
+ .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png
+ :alt: Topics and Transformations
- :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py`
+ :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py`
.. raw:: html
@@ -93,9 +96,10 @@ Understanding this functionality is vital for using gensim effectively.
.. only:: html
- .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png
+ .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png
+ :alt: Similarity Queries
- :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py`
+ :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py`
.. raw:: html
@@ -108,7 +112,7 @@ Understanding this functionality is vital for using gensim effectively.
/auto_examples/core/run_similarity_queries
.. raw:: html
-
+
@@ -127,9 +131,10 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod
.. only:: html
- .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png
+ .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png
+ :alt: Word2Vec Model
- :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py`
+ :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py`
.. raw:: html
@@ -147,9 +152,10 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod
.. only:: html
- .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png
+ .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png
+ :alt: Doc2Vec Model
- :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py`
+ :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py`
.. raw:: html
@@ -167,9 +173,10 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod
.. only:: html
- .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png
+ .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png
+ :alt: FastText Model
- :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py`
+ :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py`
.. raw:: html
@@ -187,9 +194,10 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod
.. only:: html
- .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png
+ .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png
+ :alt: Fast Similarity Queries with Annoy and Word2Vec
- :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py`
+ :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py`
.. raw:: html
@@ -207,9 +215,10 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod
.. only:: html
- .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png
+ .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png
+ :alt: LDA Model
- :ref:`sphx_glr_auto_examples_tutorials_run_lda.py`
+ :ref:`sphx_glr_auto_examples_tutorials_run_lda.py`
.. raw:: html
@@ -227,9 +236,10 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod
.. only:: html
- .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png
+ .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png
+ :alt: Word Mover's Distance
- :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py`
+ :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py`
.. raw:: html
@@ -242,7 +252,7 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod
/auto_examples/tutorials/run_wmd
.. raw:: html
-
+
@@ -261,9 +271,10 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u
.. only:: html
- .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png
+ .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png
+ :alt: How to download pre-trained models and corpora
- :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py`
+ :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py`
.. raw:: html
@@ -281,9 +292,10 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u
.. only:: html
- .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png
+ .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png
+ :alt: How to Author Gensim Documentation
- :ref:`sphx_glr_auto_examples_howtos_run_doc.py`
+ :ref:`sphx_glr_auto_examples_howtos_run_doc.py`
.. raw:: html
@@ -301,9 +313,10 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u
.. only:: html
- .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png
+ .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png
+ :alt: How to reproduce the doc2vec 'Paragraph Vector' paper
- :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py`
+ :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py`
.. raw:: html
@@ -321,9 +334,10 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u
.. only:: html
- .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png
+ .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png
+ :alt: How to Compare LDA Models
- :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py`
+ :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py`
.. raw:: html
@@ -336,7 +350,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u
/auto_examples/howtos/run_compare_lda
.. raw:: html
-
+
@@ -379,7 +393,7 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from
.. raw:: html
-
+
@@ -389,15 +403,15 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from
:class: sphx-glr-footer-gallery
- .. container:: sphx-glr-download
+ .. container:: sphx-glr-download sphx-glr-download-python
- :download:`Download all examples in Python source code: auto_examples_python.zip </home/misha/git/gensim/docs/src/auto_examples/auto_examples_python.zip>`
+ :download:`Download all examples in Python source code: auto_examples_python.zip </Volumes/work/workspace/gensim/trunk/docs/src/auto_examples/auto_examples_python.zip>`
- .. container:: sphx-glr-download
+ .. container:: sphx-glr-download sphx-glr-download-jupyter
- :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip </home/misha/git/gensim/docs/src/auto_examples/auto_examples_jupyter.zip>`
+ :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip </Volumes/work/workspace/gensim/trunk/docs/src/auto_examples/auto_examples_jupyter.zip>`
.. only:: html
diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 9460619db8..959604a4fc 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -11,7 +11,7 @@
* `Mikolov, et. al: "Distributed Representations of Words and Phrases and their Compositionality"
`_
-* `"Normalized (Pointwise) Mutual Information in Colocation Extraction" by Gerlof Bouma
+* `"Normalized (Pointwise) Mutual Information in Collocation Extraction" by Gerlof Bouma
`_
@@ -21,39 +21,42 @@
>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
- >>> from gensim.models.phrases import Phrases, Phraser
+ >>> from gensim.models.phrases import Phrases
>>>
- >>> # Load training data.
+ >>> # Create training corpus. Must be a sequence of sentences (e.g. an iterable or a generator).
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
- >>> # The training corpus must be a sequence (stream, generator) of sentences,
- >>> # with each sentence a list of tokens:
- >>> print(list(sentences)[0][:10])
+ >>> # Each sentence must be a list of string tokens:
+ >>> first_sentence = next(iter(sentences))
+ >>> print(first_sentence[:10])
['computer', 'human', 'interface', 'computer', 'response', 'survey', 'system', 'time', 'user', 'interface']
>>>
- >>> # Train a toy bigram model.
- >>> phrases = Phrases(sentences, min_count=1, threshold=1)
+ >>> # Train a toy phrase model on our training corpus.
+ >>> phrase_model = Phrases(sentences, delimiter='_', min_count=1, threshold=1)
+ >>>
>>> # Apply the trained phrases model to a new, unseen sentence.
- >>> phrases[['trees', 'graph', 'minors']]
+ >>> new_sentence = ['trees', 'graph', 'minors']
+ >>> phrase_model[new_sentence]
['trees_graph', 'minors']
>>> # The toy model considered "trees graph" a single phrase => joined the two
- >>> # tokens into a single token, `trees_graph`.
+ >>> # tokens into a single "phrase" token, using our selected `_` delimiter.
+ >>>
+ >>> # Apply the trained model to each sentence of a corpus, using the same [] syntax:
+ >>> for sent in phrase_model[sentences]:
+ ... pass
>>>
>>> # Update the model with two new sentences on the fly.
- >>> phrases.add_vocab([["hello", "world"], ["meow"]])
+ >>> phrase_model.add_vocab([["hello", "world"], ["meow"]])
>>>
>>> # Export the trained model = use less RAM, faster processing. Model updates no longer possible.
- >>> bigram = Phraser(phrases)
- >>> bigram[['trees', 'graph', 'minors']] # apply the exported model to a sentence
+ >>> frozen_model = phrase_model.freeze()
+ >>> # Apply the frozen model; same results as before:
+ >>> frozen_model[new_sentence]
['trees_graph', 'minors']
>>>
- >>> # Apply the exported model to each sentence of a corpus:
- >>> for sent in bigram[sentences]:
- ... pass
- >>>
- >>> # Save / load an exported collocation model.
- >>> bigram.save("/tmp/my_bigram_model.pkl")
- >>> bigram_reloaded = Phraser.load("/tmp/my_bigram_model.pkl")
- >>> bigram_reloaded[['trees', 'graph', 'minors']] # apply the exported model to a sentence
+ >>> # Save / load models.
+ >>> frozen_model.save("/tmp/my_phrase_model.pkl")
+ >>> model_reloaded = Phrases.load("/tmp/my_phrase_model.pkl")
+ >>> model_reloaded[['trees', 'graph', 'minors']] # apply the reloaded model to a sentence
['trees_graph', 'minors']
"""
@@ -62,7 +65,6 @@
import os
import logging
from collections import defaultdict
-import functools
import itertools
from math import log
import pickle
@@ -73,6 +75,89 @@
logger = logging.getLogger(__name__)
+NEGATIVE_INFINITY = float('-inf')
+
+
+def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
+ r"""Bigram scoring function, based on the original `Mikolov, et. al: "Distributed Representations
+ of Words and Phrases and their Compositionality" `_.
+
+ Parameters
+ ----------
+ worda_count : int
+ Number of occurrences for first word.
+ wordb_count : int
+ Number of occurrences for second word.
+ bigram_count : int
+ Number of co-occurrences for phrase "worda_wordb".
+ len_vocab : int
+ Size of vocabulary.
+ min_count : int
+ Minimum collocation count threshold.
+ corpus_word_count : int
+ Not used in this particular scoring technique.
+
+ Returns
+ -------
+ float
+ Score for the given bigram. Can be negative, or ``float('-inf')`` if either word count is zero.
+
+ Notes
+ -----
+ Formula: :math:`\frac{(bigram\_count - min\_count) * len\_vocab }{ (worda\_count * wordb\_count)}`.
+
+ """
+ denom = worda_count * wordb_count
+ if denom == 0:
+ return NEGATIVE_INFINITY
+ return (bigram_count - min_count) / float(denom) * len_vocab
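For intuition, a quick worked example of the formula above. The counts are made up purely for illustration; only the arithmetic matters:

>>> from gensim.models.phrases import original_scorer
>>> original_scorer(worda_count=10, wordb_count=5, bigram_count=4,
...                 len_vocab=1000, min_count=1, corpus_word_count=0)  # corpus_word_count unused by this scorer
60.0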
+
+
+def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
+ r"""Calculation NPMI score based on `"Normalized (Pointwise) Mutual Information in Colocation Extraction"
+ by Gerlof Bouma `_.
+
+ Parameters
+ ----------
+ worda_count : int
+ Number of occurrences for first word.
+ wordb_count : int
+ Number of occurrences for second word.
+ bigram_count : int
+ Number of co-occurrences for phrase "worda_wordb".
+ len_vocab : int
+ Not used.
+ min_count : int
+ Ignore all bigrams with total collected count lower than this value.
+ corpus_word_count : int
+ Total number of words in the corpus.
+
+ Returns
+ -------
+ float
+ If bigram_count >= min_count, return the collocation score, in the range -1 to 1.
+ Otherwise return -inf.
+
+ Notes
+ -----
+ Formula: :math:`\frac{\ln(prob(word_a, word_b) / (prob(word_a) * prob(word_b)))}{-\ln(prob(word_a, word_b))}`,
+ where :math:`prob(word) = \frac{word\_count}{corpus\_word\_count}`.
+
+ """
+ if bigram_count >= min_count:
+ corpus_word_count = float(corpus_word_count)
+ pa = worda_count / corpus_word_count
+ pb = wordb_count / corpus_word_count
+ pab = bigram_count / corpus_word_count
+ try:
+ return log(pab / (pa * pb)) / -log(pab)
+ except ValueError: # some of the counts were zero => never a phrase
+ return NEGATIVE_INFINITY
+ else:
+ # Return -infinity to make sure that no phrases will be created
+ # from bigrams less frequent than min_count.
+ return NEGATIVE_INFINITY
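As a sanity check of the NPMI formula, a tiny hypothetical example: with prob(a)=0.05, prob(b)=0.04 and prob(a, b)=0.02, the score is ln(10) / -ln(0.02) ≈ 0.59:

>>> from gensim.models.phrases import npmi_scorer
>>> round(npmi_scorer(worda_count=50, wordb_count=40, bigram_count=20,
...                   len_vocab=0, min_count=5, corpus_word_count=1000), 2)
0.59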
+
def _is_single(obj):
"""Check whether `obj` is a single document or an entire corpus.
@@ -84,7 +169,8 @@ def _is_single(obj):
Return
------
(bool, object)
- (is_single, new) tuple, where `new` yields the same sequence as `obj`.
+ 2-tuple ``(is_single_document, new_obj)``, where `new_obj`
+ yields the same sequence as the original `obj`.
Notes
-----
@@ -97,217 +183,238 @@ def _is_single(obj):
peek = next(obj_iter)
obj_iter = itertools.chain([peek], obj_iter)
except StopIteration:
- # An empty object is a single document
+ # An empty object is interpreted as a single document (not a corpus).
return True, obj
if isinstance(peek, str):
- # It's a document, return the iterator
+ # First item is a string => obj is a single document for sure.
return True, obj_iter
if temp_iter is obj:
- # Checking for iterator to the object
+ # An iterator / generator => interpret input as a corpus.
return False, obj_iter
- else:
- # If the first item isn't a string, assume obj is a corpus
- return False, obj
+ # If the first item isn't a string, assume obj is an iterable corpus.
+ return False, obj
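A brief illustration of this helper's contract (`_is_single` is private, so this is for understanding the code rather than public use):

>>> from gensim.models.phrases import _is_single
>>> _is_single(['graph', 'minors'])[0]  # a list of string tokens => a single document
True
>>> _is_single([['graph', 'minors']])[0]  # a list of token lists => a corpus
False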
-class SentenceAnalyzer:
- """Base util class for :class:`~gensim.models.phrases.Phrases` and :class:`~gensim.models.phrases.Phraser`."""
- def score_item(self, worda, wordb, components, scorer):
- """Get bi-gram score statistics.
+class _PhrasesTransformation(interfaces.TransformationABC):
+ """
+ Abstract base class for :class:`~gensim.models.phrases.Phrases` and
+ :class:`~gensim.models.phrases.FrozenPhrases`.
- Parameters
- ----------
- worda : str
- First word of bi-gram.
- wordb : str
- Second word of bi-gram.
- components : generator
- Contain all phrases.
- scorer : function
- Scorer function, as given to :class:`~gensim.models.phrases.Phrases`.
- See :func:`~gensim.models.phrases.npmi_scorer` and :func:`~gensim.models.phrases.original_scorer`.
+ """
+ def __init__(self, common_terms):
+ self.common_terms = frozenset(common_terms)
+
+ def score_candidate(self, word_a, word_b, in_between):
+ """Score a single phrase candidate.
Returns
-------
- float
- Score for given bi-gram. If bi-gram not present in dictionary - return -1.
-
+ (str, float)
+ 2-tuple of ``(delimiter-joined phrase, phrase score)`` for a phrase,
+ or ``(None, None)`` if not a phrase.
"""
- vocab = self.vocab
- if worda in vocab and wordb in vocab:
- bigram = self.delimiter.join(components)
- if bigram in vocab:
- return scorer(
- worda_count=float(vocab[worda]),
- wordb_count=float(vocab[wordb]),
- bigram_count=float(vocab[bigram]))
- return -1
-
- def analyze_sentence(self, sentence, threshold, common_terms, scorer):
- """Analyze a sentence, detecting any bigrams that should be concatenated.
+ raise NotImplementedError("ABC: override this method in child classes")
+
+ def analyze_sentence(self, sentence):
+ """Analyze a sentence, concatenating any detected phrases into a single token.
Parameters
----------
sentence : iterable of str
Token sequence representing the sentence to be analyzed.
- threshold : float
- The minimum score for a bigram to be taken into account.
- common_terms : list of object
- List of common terms, they receive special treatment.
- scorer : function
- Scorer function, as given to :class:`~gensim.models.phrases.Phrases`.
- See :func:`~gensim.models.phrases.npmi_scorer` and :func:`~gensim.models.phrases.original_scorer`.
Yields
------
(str, score)
- If bi-gram detected, a tuple where the first element is a detect bigram, second its score.
- Otherwise, the first tuple element is a single word and second is None.
+ Iterate through the input sentence tokens and yield 2-tuples of:
+ - ``(concatenated_phrase_tokens, score)`` for token sequences that form a phrase.
+ - ``(word, None)`` if the token is not a part of a phrase.
"""
- s = [utils.any2utf8(w) for w in sentence]
- # adding None is a trick that helps getting an automatic happy ending
- # as it won't be a common_word, nor score
- s.append(None)
- last_uncommon = None
- in_between = []
- for word in s:
- is_common = word in common_terms
- if not is_common and last_uncommon:
- chain = [last_uncommon] + in_between + [word]
- # test between last_uncommon
- score = self.score_item(
- worda=last_uncommon,
- wordb=word,
- components=chain,
- scorer=scorer,
- )
- if score > threshold:
- yield (chain, score)
- last_uncommon = None
- in_between = []
+ start_token, in_between = None, []
+ for word in sentence:
+ if word not in self.common_terms:
+ # The current word is a normal token, not a stop word, which means it's a potential
+ # beginning (or end) of a phrase.
+ if start_token:
+ # We're inside a potential phrase, of which this word is the end.
+ phrase, score = self.score_candidate(start_token, word, in_between)
+ if score is not None:
+ # Phrase detected!
+ yield phrase, score
+ start_token, in_between = None, []
+ else:
+ # Not a phrase after all. Dissolve the candidate's constituent tokens as individual words.
+ yield start_token, None
+ for w in in_between:
+ yield w, None
+ start_token, in_between = word, [] # new potential phrase starts here
else:
- # release words individually
- for w in itertools.chain([last_uncommon], in_between):
- yield (w, None)
- in_between = []
- last_uncommon = word
- elif not is_common:
- last_uncommon = word
- else: # common term
- if last_uncommon:
- # wait for uncommon resolution
+ # Not inside a potential bigram yet; start a new potential bigram here.
+ start_token, in_between = word, []
+ else: # We're a stop word.
+ if start_token:
+ # We're inside a potential bigram: add the stopword and keep growing the phrase.
in_between.append(word)
else:
- yield (word, None)
+ # Not inside a bigram: emit the stopword and move on. Phrases never begin with a stopword.
+ yield word, None
+ # Emit any non-phrase tokens at the end.
+ if start_token:
+ yield start_token, None
+ for w in in_between:
+ yield w, None
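To see the ``(token, score)`` stream this yields, a hedged sketch; the exact scores depend on the counts collected from your training corpus:

>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases
>>>
>>> phrases = Phrases(Text8Corpus(datapath('testcorpus.txt')), min_count=1, threshold=1)
>>> pairs = list(phrases.analyze_sentence(['trees', 'graph', 'minors']))
>>> # Typically something like [('trees_graph', 2.0...), ('minors', None)]:
>>> # detected phrases carry their score, plain words carry None.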
+ def __getitem__(self, sentence):
+ """Convert the input sequence of tokens `sentence` into a sequence of tokens where adjacent
+ tokens are replaced by a single token if they form a bigram collocation.
-class PhrasesTransformation(interfaces.TransformationABC):
- """Base util class for :class:`~gensim.models.phrases.Phrases` and :class:`~gensim.models.phrases.Phraser`."""
+ If `sentence` is an entire corpus (iterable of sentences rather than a single
+ sentence), return an iterable that converts each of the corpus' sentences
+ into phrases on the fly, one after another.
+
+ Parameters
+ ----------
+ sentence : {list of str, iterable of list of str}
+ Input sentence or a stream of sentences.
+
+ Return
+ ------
+ {list of str, iterable of list of str}
+ Sentence with phrase tokens joined by `self.delimiter` character, if input was a single sentence.
+ A generator of such joined sentences if input was a corpus.
+
+ """
+ is_single, sentence = _is_single(sentence)
+ if not is_single:
+ # If the input is an entire corpus (rather than a single sentence),
+ # return an iterable stream.
+ return self._apply(sentence)
+
+ return [token for token, _ in self.analyze_sentence(sentence)]
+
+ def export_phrases(self, sentences):
+ """Get all unique phrases (multi-word expressions) that appear in ``sentences``, and their scores.
+
+ Parameters
+ ----------
+ sentences : iterable of list of str
+ Text corpus.
+
+ Returns
+ -------
+ dict(str, float)
+ Unique phrases mapped to their scores.
+
+ Example
+ -------
+ .. sourcecode:: pycon
+
+ >>> from gensim.test.utils import datapath
+ >>> from gensim.models.word2vec import Text8Corpus
+ >>> from gensim.models.phrases import Phrases
+ >>>
+ >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
+ >>> phrases = Phrases(sentences, min_count=1, threshold=0.1)
+ >>>
+ >>> for phrase, score in phrases.export_phrases(sentences).items():
+ ... print(phrase, score)
+ """
+ result = {}
+ for sentence in sentences:
+ for phrase, score in self.analyze_sentence(sentence):
+ if score is not None:
+ result[phrase] = score
+ return result
@classmethod
def load(cls, *args, **kwargs):
"""Load a previously saved :class:`~gensim.models.phrases.Phrases` /
- :class:`~gensim.models.phrases.Phraser` class. Handles backwards compatibility from older
- :class:`~gensim.models.phrases.Phrases` / :class:`~gensim.models.phrases.Phraser`
- versions which did not support pluggable scoring functions.
+ :class:`~gensim.models.phrases.FrozenPhrases` model.
+
+ Handles backwards compatibility from older versions which did not support pluggable scoring functions.
Parameters
----------
args : object
- Sequence of arguments, see :class:`~gensim.utils.SaveLoad.load` for more information.
+ See :class:`~gensim.utils.SaveLoad.load`.
kwargs : object
- Sequence of arguments, see :class:`~gensim.utils.SaveLoad.load` for more information.
+ See :class:`~gensim.utils.SaveLoad.load`.
"""
- model = super(PhrasesTransformation, cls).load(*args, **kwargs)
- # update older models
- # if value in phrasegrams dict is a tuple, load only the scores.
+ model = super(_PhrasesTransformation, cls).load(*args, **kwargs)
- for component, score in getattr(model, "phrasegrams", {}).items():
+ # Upgrade FrozenPhrases
+ try:
+ phrasegrams = getattr(model, "phrasegrams", {})
+ component, score = next(iter(phrasegrams.items()))
if isinstance(score, tuple):
- frequency, score_val = score
- model.phrasegrams[component] = score_val
-
- # if no scoring parameter, use default scoring
+ # Value in phrasegrams used to be a tuple; keep only the 2nd tuple component = score.
+ model.phrasegrams = {
+ str(model.delimiter.join(key), encoding='utf8'): val[1]
+ for key, val in phrasegrams.items()
+ }
+ elif isinstance(component, tuple): # 3.8 => 4.0: phrasegram keys are strings, not tuples with bytestrings
+ model.phrasegrams = {
+ str(model.delimiter.join(key), encoding='utf8'): val
+ for key, val in phrasegrams.items()
+ }
+ except StopIteration:
+ # no phrasegrams, nothing to upgrade
+ pass
+
+ # If no scoring parameter, use default scoring.
if not hasattr(model, 'scoring'):
- logger.info('older version of %s loaded without scoring function', cls.__name__)
- logger.info('setting pluggable scoring method to original_scorer for compatibility')
+ logger.warning('older version of %s loaded without scoring function', cls.__name__)
+ logger.warning('setting pluggable scoring method to original_scorer for compatibility')
model.scoring = original_scorer
- # if there is a scoring parameter, and it's a text value, load the proper scoring function
+ # If there is a scoring parameter, and it's a text value, load the proper scoring function.
if hasattr(model, 'scoring'):
if isinstance(model.scoring, str):
if model.scoring == 'default':
- logger.info('older version of %s loaded with "default" scoring parameter', cls.__name__)
- logger.info('setting scoring method to original_scorer pluggable scoring method for compatibility')
+ logger.warning('older version of %s loaded with "default" scoring parameter', cls.__name__)
+ logger.warning('setting scoring method to original_scorer for compatibility')
model.scoring = original_scorer
elif model.scoring == 'npmi':
- logger.info('older version of %s loaded with "npmi" scoring parameter', cls.__name__)
- logger.info('setting scoring method to npmi_scorer pluggable scoring method for compatibility')
+ logger.warning('older version of %s loaded with "npmi" scoring parameter', cls.__name__)
+ logger.warning('setting scoring method to npmi_scorer for compatibility')
model.scoring = npmi_scorer
else:
- raise ValueError(
- 'failed to load %s model with unknown scoring setting %s' % (cls.__name__, model.scoring))
- # if there is no common_terms attribute, initialize
+ raise ValueError(f'failed to load {cls.__name__} model, unknown scoring "{model.scoring}"')
+ # Initialize new attributes to default values.
if not hasattr(model, "common_terms"):
- logger.info('older version of %s loaded without common_terms attribute', cls.__name__)
- logger.info('setting common_terms to empty set')
+ logger.warning(
+ 'older version of %s loaded without common_terms attribute, setting it to empty set',
+ cls.__name__,
+ )
model.common_terms = frozenset()
- return model
-
-
-def _sentence2token(phrase_class, sentence):
- """ Convert the input tokens `sentence` into tokens where detected bigrams are joined by a selected delimiter.
- This function is used by: meth:`~gensim.models.phrases.Phrases.__getitem__` and
- meth:`~gensim.models.phrases.Phraser.__getitem__`
-
- Parameters
- ----------
- phrase_class :
- class:`~gensim.models.phrases.Phrases` or :class:`~gensim.models.phrases.Phraser`
- sentence : {list of str, iterable of list of str}
- Sentence or text corpus.
-
- Returns
- -------
- {list of str, :class:`~gensim.interfaces.TransformedCorpus`}
- `sentence` with detected phrase bigrams merged together, or a streamed corpus of such sentences
- if the input was a corpus.
-
- """
- is_single, sentence = _is_single(sentence)
- if not is_single:
- # if the input is an entire corpus (rather than a single sentence),
- # return an iterable stream.
- return phrase_class._apply(sentence)
-
- delimiter = phrase_class.delimiter
- if hasattr(phrase_class, 'vocab'):
- scorer = functools.partial(
- phrase_class.scoring,
- len_vocab=float(len(phrase_class.vocab)),
- min_count=float(phrase_class.min_count),
- corpus_word_count=float(phrase_class.corpus_word_count))
- else:
- scorer = None
- bigrams = phrase_class.analyze_sentence(sentence, threshold=phrase_class.threshold,
- common_terms=phrase_class.common_terms, scorer=scorer)
+ if not hasattr(model, 'corpus_word_count'):
+ logger.warning('older version of %s loaded without corpus_word_count', cls.__name__)
+ logger.warning('setting corpus_word_count to 0, do not use it in your scoring function')
+ model.corpus_word_count = 0
- new_s = []
- for words, score in bigrams:
- if score is not None:
- words = delimiter.join(words)
- new_s.append(words)
- return [utils.to_unicode(w) for w in new_s]
+ # Before 4.0.0, we stored strings as UTF8 bytes internally, to save RAM. Since 4.0.0, we use strings.
+ if getattr(model, 'vocab', None):
+ word = next(iter(model.vocab)) # get a random key – any key will do
+ if not isinstance(word, str):
+ logger.info("old version of %s loaded, upgrading %i words in memory", cls.__name__, len(model.vocab))
+ logger.info("re-save the loaded model to avoid this upgrade in the future")
+ vocab = defaultdict(int)
+ for key, value in model.vocab.items(): # needs lots of extra RAM temporarily!
+ vocab[str(key, encoding='utf8')] = value
+ model.vocab = vocab
+ if not isinstance(model.delimiter, str):
+ model.delimiter = str(model.delimiter, encoding='utf8')
+ return model
-class Phrases(SentenceAnalyzer, PhrasesTransformation):
+class Phrases(_PhrasesTransformation):
"""Detect phrases based on collocation counts."""
def __init__(
self, sentences=None, min_count=5, threshold=10.0,
- max_vocab_size=40000000, delimiter=b'_', progress_per=10000,
+ max_vocab_size=40000000, delimiter='_', progress_per=10000,
scoring='default', common_terms=frozenset(),
):
"""
@@ -330,7 +437,7 @@ def __init__(
to keep memory under control. The default of 40M needs about 3.6GB of RAM. Increase/decrease
`max_vocab_size` depending on how much available memory you have.
delimiter : str, optional
- Glue character used to join collocation tokens, should be a byte string (e.g. b'_').
+ Glue character used to join collocation tokens.
scoring : {'default', 'npmi', function}, optional
Specify how potential phrases are scored. `scoring` can be set with either a string that refers to a
built-in scoring function, or with a function with the expected parameter names.
@@ -359,9 +466,40 @@ def __init__(
* corpus_word_count - the total number of tokens (non-unique) in `sentences`
The scoring function **must accept all these parameters**, even if it doesn't use them in its scoring.
+
The scoring function **must be pickleable**.
+ Examples
+ ----------
+ .. sourcecode:: pycon
+
+ >>> from gensim.test.utils import datapath
+ >>> from gensim.models.word2vec import Text8Corpus
+ >>> from gensim.models.phrases import Phrases
+ >>>
+ >>> # Load corpus and train a model.
+ >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
+ >>> phrases = Phrases(sentences, min_count=1, threshold=1)
+ >>>
+ >>> # Use the model to detect phrases in a new sentence.
+ >>> sent = [u'trees', u'graph', u'minors']
+ >>> print(phrases[sent])
+ [u'trees_graph', u'minors']
+ >>>
+ >>> # Or transform multiple sentences at once.
+ >>> sents = [[u'trees', u'graph', u'minors'], [u'graph', u'minors']]
+ >>> for phrase in phrases[sents]:
+ ... print(phrase)
+ [u'trees_graph', u'minors']
+ [u'graph_minors']
+ >>>
+ >>> # Export a FrozenPhrases object that is more efficient but doesn't allow any more training.
+ >>> frozen_phrases = phrases.freeze()
+ >>> print(frozen_phrases[sent])
+ [u'trees_graph', u'minors']
+
"""
+ super().__init__(common_terms=common_terms)
if min_count <= 0:
raise ValueError("min_count should be at least 1")
@@ -370,10 +508,9 @@ def __init__(
if scoring == 'npmi' and (threshold < -1 or threshold > 1):
raise ValueError("threshold should be between -1 and 1 for npmi scoring")
- # set scoring based on string
- # intentially override the value of the scoring parameter rather than set self.scoring here,
- # to still run the check of scoring function parameters in the next code block
-
+ # Set scoring based on string.
+ # Intentionally override the value of the scoring parameter rather than set self.scoring here,
+ # to still run the check of scoring function parameters in the next code block.
if isinstance(scoring, str):
if scoring == 'default':
scoring = original_scorer
@@ -382,65 +519,45 @@ def __init__(
else:
raise ValueError(f'unknown scoring method string {scoring} specified')
- scoring_parameters = [
+ scoring_params = [
'worda_count', 'wordb_count', 'bigram_count', 'len_vocab', 'min_count', 'corpus_word_count',
]
if callable(scoring):
- if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
+ missing = [param for param in scoring_params if param not in getargspec(scoring)[0]]
+ if not missing:
self.scoring = scoring
else:
- raise ValueError('scoring function missing expected parameters')
+ raise ValueError(f'scoring function missing expected parameters {missing}')
self.min_count = min_count
self.threshold = threshold
self.max_vocab_size = max_vocab_size
- self.vocab = defaultdict(int) # mapping between utf8 token => its count
+ self.vocab = defaultdict(int) # mapping between token => its count
self.min_reduce = 1 # ignore any tokens with count smaller than this
self.delimiter = delimiter
self.progress_per = progress_per
self.corpus_word_count = 0
- self.common_terms = frozenset(utils.any2utf8(w) for w in common_terms)
- # ensure picklability of custom scorer
+ # Ensure picklability of the scorer.
try:
pickle.loads(pickle.dumps(self.scoring))
except pickle.PickleError:
- raise pickle.PickleError('Custom Phrases scoring function must be pickle-able')
+ raise pickle.PickleError(f'Custom scoring function in {self.__class__.__name__} must be pickle-able')
if sentences is not None:
self.add_vocab(sentences)
- @classmethod
- def load(cls, *args, **kwargs):
- """Load a previously saved Phrases class.
- Handles backwards compatibility from older Phrases versions which did not support pluggable scoring functions.
-
- Parameters
- ----------
- args : object
- Sequence of arguments, see :class:`~gensim.utils.SaveLoad.load` for more information.
- kwargs : object
- Sequence of arguments, see :class:`~gensim.utils.SaveLoad.load` for more information.
-
- """
- model = super(Phrases, cls).load(*args, **kwargs)
- if not hasattr(model, 'corpus_word_count'):
- logger.info('older version of %s loaded without corpus_word_count', cls.__name__)
- logger.info('Setting it to 0, do not use it in your scoring function.')
- model.corpus_word_count = 0
- return model
-
def __str__(self):
- """Get short string representation of this phrase detector."""
return "%s<%i vocab, min_count=%s, threshold=%s, max_vocab_size=%s>" % (
self.__class__.__name__, len(self.vocab), self.min_count,
self.threshold, self.max_vocab_size,
)
@staticmethod
- def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000,
- common_terms=frozenset()):
- """Collect unigram/bigram counts from the `sentences` iterable.
+ def _learn_vocab(
+ sentences, max_vocab_size, delimiter='_', common_terms=frozenset(), progress_per=10000,
+ ):
+ """Collect unigram and bigram counts from the `sentences` iterable.
Parameters
----------
@@ -451,62 +568,40 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000,
for such examples.
max_vocab_size : int
Maximum size (number of tokens) of the vocabulary. Used to control pruning of less common words,
- to keep memory under control. The default of 40M needs about 3.6GB of RAM. Increase/decrease
+ to keep memory under control. 40M needs about 3.6GB of RAM. Increase/decrease
`max_vocab_size` depending on how much available memory you have.
delimiter : str, optional
- Glue character used to join collocation tokens, should be a byte string (e.g. b'_').
- progress_per : int
- Write logs every `progress_per` sentence.
+ Glue character used to join collocation tokens.
common_terms : set of str, optional
- List of "stop words" that won't affect frequency count of expressions containing them.
- Allow to detect expressions like "bank_of_america" or "eye_of_the_beholder".
+ List of "stop words" that won't affect frequency count of phrases containing them.
+ Allow to detect phrases like "bank_of_america" or "eye_of_the_beholder".
+ progress_per : int
+ Log progress once every `progress_per` sentences.
Return
------
(int, dict of (str, int), int)
- Number of pruned words, counters for each word/bi-gram and total number of words.
-
- Example
- ----------
- .. sourcecode:: pycon
-
- >>> from gensim.test.utils import datapath
- >>> from gensim.models.word2vec import Text8Corpus
- >>> from gensim.models.phrases import Phrases
- >>>
- >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
- >>> pruned_words, counters, total_words = Phrases.learn_vocab(sentences, 100)
- >>> (pruned_words, total_words)
- (1, 29)
- >>> counters['computer']
- 2
- >>> counters['response_time']
- 1
+ Number of pruned words, counters for each word/bi-gram, and total number of words.
"""
- sentence_no = -1
- total_words = 0
- logger.info("collecting all words and their counts")
+ sentence_no, total_words, min_reduce = -1, 0, 1
vocab = defaultdict(int)
- min_reduce = 1
+ logger.info("collecting all words and their counts")
for sentence_no, sentence in enumerate(sentences):
if sentence_no % progress_per == 0:
logger.info(
"PROGRESS: at sentence #%i, processed %i words and %i word types",
sentence_no, total_words, len(vocab),
)
- s = [utils.any2utf8(w) for w in sentence]
- last_uncommon = None
- in_between = []
- for word in s:
+ start_token, in_between = None, []
+ for word in sentence:
if word not in common_terms:
vocab[word] += 1
- if last_uncommon is not None:
- components = itertools.chain([last_uncommon], in_between, [word])
- vocab[delimiter.join(components)] += 1
- last_uncommon = word
- in_between = []
- elif last_uncommon is not None:
+ if start_token is not None:
+ phrase_tokens = itertools.chain([start_token], in_between, [word])
+ vocab[delimiter.join(phrase_tokens)] += 1
+ start_token, in_between = word, [] # treat word as both end of a phrase AND beginning of another
+ elif start_token is not None:
in_between.append(word)
total_words += 1
@@ -515,13 +610,13 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000,
min_reduce += 1
logger.info(
- "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
- len(vocab), total_words, sentence_no + 1
+ "collected %i token types (unigram + bigrams) from a corpus of %i words and %i sentences",
+ len(vocab), total_words, sentence_no + 1,
)
return min_reduce, vocab, total_words
def add_vocab(self, sentences):
- """Update model with new `sentences`.
+ """Update model parameters with new `sentences`.
Parameters
----------
@@ -535,7 +630,8 @@ def add_vocab(self, sentences):
>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases
- >>> # Create corpus and use it for phrase detector
+ >>>
+ >>> # Train a phrase detector from a text corpus.
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
>>> phrases = Phrases(sentences) # train model
>>> assert len(phrases.vocab) == 37
@@ -549,16 +645,18 @@ def add_vocab(self, sentences):
>>> assert len(phrases.vocab) == 60
"""
- # uses a separate vocab to collect the token counts from `sentences`.
- # this consumes more RAM than merging new sentences into `self.vocab`
+ # Uses a separate vocab to collect the token counts from `sentences`.
+ # This consumes more RAM than merging new sentences into `self.vocab`
# directly, but gives the new sentences a fighting chance to collect
# sufficient counts, before being pruned out by the (large) accumulated
# counts collected in previous learn_vocab runs.
- min_reduce, vocab, total_words = self.learn_vocab(
- sentences, self.max_vocab_size, self.delimiter, self.progress_per, self.common_terms)
+ min_reduce, vocab, total_words = self._learn_vocab(
+ sentences, max_vocab_size=self.max_vocab_size, delimiter=self.delimiter,
+ progress_per=self.progress_per, common_terms=self.common_terms,
+ )
self.corpus_word_count += total_words
- if len(self.vocab) > 0:
+ if self.vocab:
logger.info("merging %i counts into %s", len(vocab), self)
self.min_reduce = max(self.min_reduce, min_reduce)
for word, count in vocab.items():
@@ -566,225 +664,59 @@ def add_vocab(self, sentences):
if len(self.vocab) > self.max_vocab_size:
utils.prune_vocab(self.vocab, self.min_reduce)
self.min_reduce += 1
- logger.info("merged %s", self)
else:
- # in common case, avoid doubling gigantic dict
- logger.info("using %i counts as vocab in %s", len(vocab), self)
+ # Optimization for a common case: the current vocab is empty, so apply
+ # the new vocab directly, no need to double it in memory.
self.vocab = vocab
-
- def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
- """Get all phrases that appear in 'sentences' that pass the bigram threshold.
-
- Parameters
- ----------
- sentences : iterable of list of str
- Text corpus.
- out_delimiter : str, optional
- Delimiter used to "glue" together words that form a bigram phrase.
- as_tuples : bool, optional
- Yield `(tuple(words), score)` instead of `(out_delimiter.join(words), score)`?
-
- Yields
- ------
- ((str, str), float) **or** (str, float)
- Phrases detected in `sentences`. Return type depends on the `as_tuples` parameter.
-
- Example
- -------
- .. sourcecode:: pycon
-
- >>> from gensim.test.utils import datapath
- >>> from gensim.models.word2vec import Text8Corpus
- >>> from gensim.models.phrases import Phrases
- >>>
- >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
- >>> phrases = Phrases(sentences, min_count=1, threshold=0.1)
- >>>
- >>> for phrase, score in phrases.export_phrases(sentences):
- ... pass
-
- """
- analyze_sentence = functools.partial(
- self.analyze_sentence,
- threshold=self.threshold,
- common_terms=self.common_terms,
- scorer=functools.partial(
- self.scoring,
- len_vocab=float(len(self.vocab)),
- min_count=float(self.min_count),
- corpus_word_count=float(self.corpus_word_count),
- ),
+ logger.info("merged %s", self)
+
+ def score_candidate(self, word_a, word_b, in_between):
+ # Micro optimization: check for quick early-out conditions, before the actual scoring.
+ word_a_cnt = self.vocab[word_a]
+ if word_a_cnt <= 0:
+ return None, None
+
+ word_b_cnt = self.vocab[word_b]
+ if word_b_cnt <= 0:
+ return None, None
+
+ phrase = self.delimiter.join([word_a] + in_between + [word_b])
+ # XXX: Why do we care about *all* phrase tokens? Why not just score the start+end bigram?
+ phrase_cnt = self.vocab[phrase]
+ if phrase_cnt <= 0:
+ return None, None
+
+ score = self.scoring(
+ worda_count=word_a_cnt, wordb_count=word_b_cnt, bigram_count=phrase_cnt,
+ len_vocab=len(self.vocab), min_count=self.min_count, corpus_word_count=self.corpus_word_count,
)
- for sentence in sentences:
- bigrams = analyze_sentence(sentence)
- # keeps only not None scores
- filtered = ((words, score) for words, score in bigrams if score is not None)
- for words, score in filtered:
- if as_tuples:
- yield (tuple(words), score)
- else:
- yield (out_delimiter.join(words), score)
+ if score <= self.threshold:
+ return None, None
- def __getitem__(self, sentence):
- """Convert the input tokens `sentence` into tokens where detected bigrams are joined by a selected delimiter.
+ return phrase, score
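A consequence of this final check is that raising `threshold` can only shrink the set of detected phrases, never grow it. A small sketch using the bundled test corpus:

>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases
>>>
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
>>> lenient = Phrases(sentences, min_count=1, threshold=1)
>>> strict = Phrases(sentences, min_count=1, threshold=100)
>>> len(lenient.export_phrases(sentences)) >= len(strict.export_phrases(sentences))
True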
- If `sentence` is an entire corpus (iterable of sentences rather than a single
- sentence), return an iterable that converts each of the corpus' sentences
- into phrases on the fly, one after another.
+ def freeze(self):
+ """
+ Return an object that contains the bare minimum of information while still allowing
+ phrase detection. See :class:`~gensim.models.phrases.FrozenPhrases`.
- Parameters
- ----------
- sentence : {list of str, iterable of list of str}
- Sentence or text corpus.
+ Use this "frozen model" to dramatically reduce RAM footprint if you don't plan to
+ make any further changes to your `Phrases` model.
Returns
-------
- {list of str, :class:`gensim.interfaces.TransformedCorpus`}
- `sentence` with detected phrase bigrams merged together, or a streamed corpus of such sentences
- if the input was a corpus.
-
- Examples
- ----------
- .. sourcecode:: pycon
-
- >>> from gensim.test.utils import datapath
- >>> from gensim.models.word2vec import Text8Corpus
- >>> from gensim.models.phrases import Phrases, Phraser
- >>>
- >>> # Create corpus
- >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
- >>>
- >>> # Train the detector with:
- >>> phrases = Phrases(sentences, min_count=1, threshold=1)
- >>> # Input is a list of unicode strings:
- >>> sent = [u'trees', u'graph', u'minors']
- >>> # Both of these tokens appear in corpus at least twice, and phrase score is higher, than treshold = 1:
- >>> print(phrases[sent])
- [u'trees_graph', u'minors']
- >>>
- >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
- >>> phrases = Phrases(sentences, min_count=1, threshold=1)
- >>> phraser = Phraser(phrases) # for speedup
- >>>
- >>> sent = [[u'trees', u'graph', u'minors'], [u'graph', u'minors']]
- >>> for phrase in phraser[sent]:
- ... pass
+ :class:`~gensim.models.phrases.FrozenPhrases`
+ Exported object that's smaller, faster, but doesn't support model updates.
"""
- return _sentence2token(self, sentence)
-
-
-def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
- r"""Bigram scoring function, based on the original `Mikolov, et. al: "Distributed Representations
- of Words and Phrases and their Compositionality" `_.
+ return FrozenPhrases(self)
- Parameters
- ----------
- worda_count : int
- Number of occurrences for first word.
- wordb_count : int
- Number of occurrences for second word.
- bigram_count : int
- Number of co-occurrences for phrase "worda_wordb".
- len_vocab : int
- Size of vocabulary.
- min_count: int
- Minimum collocation count threshold.
- corpus_word_count : int
- Not used in this particular scoring technique.
-
- Returns
- -------
- float
- Score for given bi-gram, greater than or equal to 0.
-
- Notes
- -----
- Formula: :math:`\frac{(bigram\_count - min\_count) * len\_vocab }{ (worda\_count * wordb\_count)}`.
- """
- return (bigram_count - min_count) / worda_count / wordb_count * len_vocab
-
-
-def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
- r"""Calculation NPMI score based on `"Normalized (Pointwise) Mutual Information in Colocation Extraction"
- by Gerlof Bouma `_.
-
- Parameters
- ----------
- worda_count : int
- Number of occurrences for first word.
- wordb_count : int
- Number of occurrences for second word.
- bigram_count : int
- Number of co-occurrences for phrase "worda_wordb".
- len_vocab : int
- Not used.
- min_count: int
- Ignore all bigrams with total collected count lower than this value.
- corpus_word_count : int
- Total number of words in the corpus.
-
- Returns
- -------
- float
- Score for given bi-gram, in the range -1 to 1.
-
- Notes
- -----
- Formula: :math:`\frac{ln(prop(word_a, word_b) / (prop(word_a)*prop(word_b)))}{ -ln(prop(word_a, word_b)}`,
- where :math:`prob(word) = \frac{word\_count}{corpus\_word\_count}`
-
- """
- if bigram_count >= min_count:
- pa = worda_count / corpus_word_count
- pb = wordb_count / corpus_word_count
- pab = bigram_count / corpus_word_count
- return log(pab / (pa * pb)) / -log(pab)
- else:
- # Return -infinity to make sure that no phrases will be created
- # from bigrams less frequent than min_count
- return float('-inf')
-
-
-def pseudocorpus(source_vocab, sep, common_terms=frozenset()):
- """Feeds `source_vocab`'s compound keys back to it, to discover phrases.
-
- Parameters
- ----------
- source_vocab : iterable of list of str
- Tokens vocabulary.
- sep : str
- Separator element.
- common_terms : set, optional
- Immutable set of stopwords.
-
- Yields
- ------
- list of str
- Phrase.
-
- """
- for k in source_vocab:
- if sep not in k:
- continue
- unigrams = k.split(sep)
- for i in range(1, len(unigrams)):
- if unigrams[i - 1] not in common_terms:
- # do not join common terms
- cterms = list(itertools.takewhile(lambda w: w in common_terms, unigrams[i:]))
- tail = unigrams[i + len(cterms):]
- components = [sep.join(unigrams[:i])] + cterms
- if tail:
- components.append(sep.join(tail))
- yield components
-
-
-class Phraser(SentenceAnalyzer, PhrasesTransformation):
+class FrozenPhrases(_PhrasesTransformation):
"""Minimal state & functionality exported from :class:`~gensim.models.phrases.Phrases`.
The goal of this class is to cut down memory consumption of `Phrases`, by discarding model state
- not strictly needed for the bigram detection task.
+ not strictly needed for the phrase detection task.
Use this instead of `Phrases` if you do not need to update the bigram statistics with new documents any more.
@@ -796,27 +728,28 @@ def __init__(self, phrases_model):
Parameters
----------
phrases_model : :class:`~gensim.models.phrases.Phrases`
- Trained phrases instance.
+ Trained phrases instance, to extract all phrases from.
Notes
-----
- After the one-time initialization, a :class:`~gensim.models.phrases.Phraser` will be much smaller and somewhat
- faster than using the full :class:`~gensim.models.phrases.Phrases` model.
+ After the one-time initialization, a :class:`~gensim.models.phrases.FrozenPhrases` will be much
+ smaller and faster than using the full :class:`~gensim.models.phrases.Phrases` model.
Examples
- --------
+ ----------
.. sourcecode:: pycon
>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
- >>> from gensim.models.phrases import Phrases, Phraser
+ >>> from gensim.models.phrases import Phrases
>>>
+ >>> # Load corpus and train a model.
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
>>> phrases = Phrases(sentences, min_count=1, threshold=1)
>>>
- >>> bigram = Phraser(phrases)
- >>> sent = [u'trees', u'graph', u'minors']
- >>> print(bigram[sent])
+ >>> # Export a FrozenPhrases object that is more efficient but doesn't allow further training.
+ >>> frozen_phrases = phrases.freeze()
+ >>> sent = [u'trees', u'graph', u'minors']
+ >>> print(frozen_phrases[sent])
[u'trees_graph', u'minors']
"""
@@ -825,99 +758,43 @@ def __init__(self, phrases_model):
self.delimiter = phrases_model.delimiter
self.scoring = phrases_model.scoring
self.common_terms = phrases_model.common_terms
- corpus = self.pseudocorpus(phrases_model)
- self.phrasegrams = {}
- logger.info('source_vocab length %i', len(phrases_model.vocab))
- count = 0
- for bigram, score in phrases_model.export_phrases(corpus, self.delimiter, as_tuples=True):
- if bigram in self.phrasegrams:
- logger.info('Phraser repeat %s', bigram)
- self.phrasegrams[bigram] = score
- count += 1
- if not count % 50000:
- logger.info('Phraser added %i phrasegrams', count)
- logger.info('Phraser built with %i phrasegrams', len(self.phrasegrams))
-
- def pseudocorpus(self, phrases_model):
- """Alias for :func:`gensim.models.phrases.pseudocorpus`.
-
- Parameters
- ----------
- phrases_model : :class:`~gensim.models.phrases.Phrases`
- Phrases instance.
-
- Return
- ------
- generator
- Generator with phrases.
-
- """
- return pseudocorpus(phrases_model.vocab, phrases_model.delimiter, phrases_model.common_terms)
+ logger.info('exporting phrases from %s', phrases_model)
+ self.phrasegrams = self._import_phrases(phrases_model)
+ logger.info('exported %s', self)
- def score_item(self, worda, wordb, components, scorer):
- """Score a bigram.
+ def __str__(self):
+ return "%s<%i phrases, min_count=%s, threshold=%s>" % (
+ self.__class__.__name__, len(self.phrasegrams), self.min_count, self.threshold,
+ )
- Parameters
- ----------
- worda : str
- First word for comparison.
- wordb : str
- Second word for comparison.
- components : generator
- Contain phrases.
- scorer : {'default', 'npmi'}
- NOT USED.
+ def _import_phrases(self, phrases_model):
+ """Extract all phrases that pass the threshold out of `phrases_model`.
Returns
- -------
- float
- Score for given bi-gram, if bi-gram not presented in dictionary - return -1.
+ -------
+ dict[str, float]
+ Mapping between phrases and their scores.
"""
- try:
- return self.phrasegrams[tuple(components)]
- except KeyError:
- return -1
-
- def __getitem__(self, sentence):
- """Convert the input sequence of tokens `sentence` into a sequence of tokens where adjacent
- tokens are replaced by a single token if they form a bigram collocation.
+ result, source_vocab = {}, phrases_model.vocab
+ for token in source_vocab:
+ unigrams = token.split(self.delimiter)
+ if len(unigrams) < 2:
+ continue # no phrases here
+ phrase, score = phrases_model.score_candidate(unigrams[0], unigrams[-1], unigrams[1:-1])
+ if score is not None:
+ result[phrase] = score
+ return result
- Parameters
- ----------
- sentence : {list of str, iterable of list of str}
- Input sentence or a stream of sentences.
-
- Return
- ------
- {list of str, iterable of list of str}
- Sentence or sentences with phrase tokens joined by `self.delimiter` character.
+ def score_candidate(self, word_a, word_b, in_between):
+ phrase = self.delimiter.join([word_a] + in_between + [word_b])
+ score = self.phrasegrams.get(phrase, NEGATIVE_INFINITY)
+ if score > self.threshold:
+ return phrase, score
+ return None, None
- Examples
- ----------
- .. sourcecode:: pycon
- >>> from gensim.test.utils import datapath
- >>> from gensim.models.word2vec import Text8Corpus
- >>> from gensim.models.phrases import Phrases, Phraser
- >>>
- >>> sentences = Text8Corpus(datapath('testcorpus.txt')) # Read corpus
- >>>
- >>> phrases = Phrases(sentences, min_count=1, threshold=1) # Train model
- >>> # Create a Phraser object to transform any sentence and turn 2 suitable tokens into 1 phrase
- >>> phraser_model = Phraser(phrases)
- >>>
- >>> sent = [u'trees', u'graph', u'minors']
- >>> print(phraser_model[sent])
- [u'trees_graph', u'minors']
- >>> sent = [[u'trees', u'graph', u'minors'], [u'graph', u'minors']]
- >>> for phrase in phraser_model[sent]:
- ... print(phrase)
- [u'trees_graph', u'minors']
- [u'graph_minors']
-
- """
- return _sentence2token(self, sentence)
+Phraser = FrozenPhrases # alias for backward compatibility
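Thanks to this alias, existing imports keep working unchanged, and old models pickled under the `Phraser` name should unpickle through the same class:

>>> from gensim.models.phrases import Phraser, FrozenPhrases
>>> Phraser is FrozenPhrases
True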
if __name__ == '__main__':
@@ -935,7 +812,6 @@ def __getitem__(self, sentence):
from gensim.models.word2vec import Text8Corpus
sentences = Text8Corpus(infile)
- # test_doc = LineSentence('test/test_data/testcorpus.txt')
bigram = Phrases(sentences, min_count=5, threshold=100)
for s in bigram[sentences]:
- print(utils.to_utf8(u' '.join(s)))
+ print(u' '.join(s))
diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py
index 1570acf224..4c04292473 100644
--- a/gensim/sklearn_api/phrases.py
+++ b/gensim/sklearn_api/phrases.py
@@ -27,12 +27,12 @@
>>> assert ['I', 'love', 'computer_science'] == m.fit_transform(texts)[0]
"""
-from six import string_types
+
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError
from gensim import models
-from gensim.models.phrases import Phraser
+from gensim.models.phrases import FrozenPhrases
class PhrasesTransformer(TransformerMixin, BaseEstimator):
@@ -44,8 +44,10 @@ class PhrasesTransformer(TransformerMixin, BaseEstimator):
`_.
"""
- def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000,
- delimiter=b'_', progress_per=10000, scoring='default', common_terms=frozenset()):
+ def __init__(
+ self, min_count=5, threshold=10.0, max_vocab_size=40000000,
+ delimiter='_', progress_per=10000, scoring='default', common_terms=frozenset(),
+ ):
"""
Parameters
@@ -58,7 +60,7 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000,
Maximum size of the vocabulary. Used to control pruning of less common words, to keep memory under control.
The default of 40M needs about 3.6GB of RAM.
delimiter : str, optional
- Character used to join collocation tokens, should be a byte string (e.g. b'_').
+ Character used to join collocation tokens (e.g. '_').
progress_per : int, optional
Training will report to the logger every that many phrases are learned.
scoring : str or function, optional
@@ -127,7 +129,7 @@ def fit(self, X, y=None):
max_vocab_size=self.max_vocab_size, delimiter=self.delimiter,
progress_per=self.progress_per, scoring=self.scoring, common_terms=self.common_terms
)
- self.phraser = Phraser(self.gensim_model)
+ self.phraser = FrozenPhrases(self.gensim_model)
return self
def transform(self, docs):
@@ -152,10 +154,10 @@ def transform(self, docs):
)
if self.phraser is None:
- self.phraser = Phraser(self.gensim_model)
+ self.phraser = FrozenPhrases(self.gensim_model)
# input as python lists
- if isinstance(docs[0], string_types):
+ if isinstance(docs[0], str):
docs = [docs]
return [self.phraser[doc] for doc in docs]
@@ -186,5 +188,5 @@ def partial_fit(self, X):
)
self.gensim_model.add_vocab(X)
- self.phraser = Phraser(self.gensim_model)
+ self.phraser = FrozenPhrases(self.gensim_model)
return self
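For reference, the scikit-learn wrapper keeps the usual fit/transform flow after this change. A minimal sketch with made-up texts; with these toy counts and `threshold=1`, only "computer science" scores high enough to be joined:

>>> from gensim.sklearn_api.phrases import PhrasesTransformer
>>>
>>> texts = [
...     ['I', 'love', 'computer', 'science'],
...     ['computer', 'science', 'is', 'my', 'passion'],
... ]
>>> m = PhrasesTransformer(min_count=1, threshold=1).fit(texts)
>>> m.transform([['computer', 'science', 'rocks']])[0]
['computer_science', 'rocks']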
diff --git a/gensim/test/test_data/phrases-transformer-new-v3-5-0.pkl b/gensim/test/test_data/phrases-transformer-new-v3-5-0.pkl
deleted file mode 100644
index 7799418058..0000000000
Binary files a/gensim/test/test_data/phrases-transformer-new-v3-5-0.pkl and /dev/null differ
diff --git a/gensim/test/test_data/phrases-transformer-v3-5-0.pkl b/gensim/test/test_data/phrases-transformer-v3-5-0.pkl
deleted file mode 100644
index 8ffef6763b..0000000000
Binary files a/gensim/test/test_data/phrases-transformer-v3-5-0.pkl and /dev/null differ
diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py
index ed85fea2b5..9c7a73cae4 100644
--- a/gensim/test/test_phrases.py
+++ b/gensim/test/test_phrases.py
@@ -4,140 +4,97 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
-Automated tests for checking transformation algorithms (the models package).
+Automated tests for the phrase detection module.
"""
-
import logging
import unittest
-import six
import numpy as np
-from gensim.utils import to_unicode
-from gensim.models.phrases import SentenceAnalyzer, Phrases, Phraser
-from gensim.models.phrases import pseudocorpus, original_scorer
+from gensim.models.phrases import Phrases, FrozenPhrases, _PhrasesTransformation
+from gensim.models.phrases import original_scorer
from gensim.test.utils import common_texts, temporary_file, datapath
-class TestUtils(unittest.TestCase):
-
- def test_pseudocorpus_no_common_terms(self):
- vocab = [
- "prime_minister",
- "gold",
- "chief_technical_officer",
- "effective"]
- result = list(pseudocorpus(vocab, "_"))
- self.assertEqual(
- result,
- [["prime", "minister"],
- ["chief", "technical_officer"],
- ["chief_technical", "officer"]])
-
- def test_pseudocorpus_with_common_terms(self):
- vocab = [
- "hall_of_fame",
- "gold",
- "chief_of_political_bureau",
- "effective",
- "beware_of_the_dog_in_the_yard"]
- common_terms = frozenset(["in", "the", "of"])
- result = list(pseudocorpus(vocab, "_", common_terms=common_terms))
- self.assertEqual(
- result,
- [["hall", "of", "fame"],
- ["chief", "of", "political_bureau"],
- ["chief_of_political", "bureau"],
- ["beware", "of", "the", "dog_in_the_yard"],
- ["beware_of_the_dog", "in", "the", "yard"]])
-
-
class TestPhraseAnalysis(unittest.TestCase):
- class AnalysisTester(SentenceAnalyzer):
+ class AnalysisTester(_PhrasesTransformation):
- def __init__(self, scores):
+ def __init__(self, scores, threshold):
+ super().__init__(common_terms={"a", "the", "with", "of"})
self.scores = scores
+ self.threshold = threshold
- def score_item(self, worda, wordb, components, scorer):
- """Override for test purpose"""
- if worda is not None and wordb is not None:
- bigram_word = b"_".join(components)
- return self.scores.get(bigram_word, -1)
- else:
- return -1
-
- def analyze(self, scores, sentence):
- analyzer = self.AnalysisTester(scores)
- return list(analyzer.analyze_sentence(
- sentence,
- threshold=1,
- common_terms={b"a", b"the", b"with", b"of"},
- scorer=None))
-
- def analyze_words(self, scores, sentence):
- result = (
- w if isinstance(w, (tuple, list)) else [w]
- for w, score in self.analyze(scores, sentence))
- return [b"_".join(w).decode("utf-8") for w in result]
+ def score_candidate(self, word_a, word_b, in_between):
+ phrase = "_".join([word_a] + in_between + [word_b])
+ score = self.scores.get(phrase, -1)
+ if score > self.threshold:
+ return phrase, score
+ return None, None
def test_simple_analysis(self):
- s = ["simple", "sentence", "should", "pass"]
- result = self.analyze_words({}, s)
- self.assertEqual(result, s)
- s = ["a", "simple", "sentence", "with", "no", "bigram", "but", "common", "terms"]
- result = self.analyze_words({}, s)
- self.assertEqual(result, s)
+ """Test transformation with no phrases."""
+ sentence = ["simple", "sentence", "should", "pass"]
+ result = self.AnalysisTester({}, threshold=1)[sentence]
+ self.assertEqual(result, sentence)
+ sentence = ["a", "simple", "sentence", "with", "no", "bigram", "but", "common", "terms"]
+ result = self.AnalysisTester({}, threshold=1)[sentence]
+ self.assertEqual(result, sentence)
def test_analysis_bigrams(self):
scores = {
- b"simple_sentence": 2, b"sentence_many": 2,
- b"many_possible": 2, b"possible_bigrams": 2}
- s = ["simple", "sentence", "many", "possible", "bigrams"]
- result = self.analyze_words(scores, s)
+ "simple_sentence": 2, "sentence_many": 2,
+ "many_possible": 2, "possible_bigrams": 2,
+ }
+ sentence = ["simple", "sentence", "many", "possible", "bigrams"]
+ result = self.AnalysisTester(scores, threshold=1)[sentence]
self.assertEqual(result, ["simple_sentence", "many_possible", "bigrams"])
- s = ["some", "simple", "sentence", "many", "bigrams"]
- result = self.analyze_words(scores, s)
+ sentence = ["some", "simple", "sentence", "many", "bigrams"]
+ result = self.AnalysisTester(scores, threshold=1)[sentence]
self.assertEqual(result, ["some", "simple_sentence", "many", "bigrams"])
- s = ["some", "unrelated", "simple", "words"]
- result = self.analyze_words(scores, s)
- self.assertEqual(result, s)
+ sentence = ["some", "unrelated", "simple", "words"]
+ result = self.AnalysisTester(scores, threshold=1)[sentence]
+ self.assertEqual(result, sentence)
def test_analysis_common_terms(self):
scores = {
- b"simple_sentence": 2, b"sentence_many": 2,
- b"many_possible": 2, b"possible_bigrams": 2}
- s = ["a", "simple", "sentence", "many", "the", "possible", "bigrams"]
- result = self.analyze_words(scores, s)
+ "simple_sentence": 2, "sentence_many": 2,
+ "many_possible": 2, "possible_bigrams": 2,
+ }
+ sentence = ["a", "simple", "sentence", "many", "the", "possible", "bigrams"]
+ result = self.AnalysisTester(scores, threshold=1)[sentence]
self.assertEqual(result, ["a", "simple_sentence", "many", "the", "possible_bigrams"])
- s = ["simple", "the", "sentence", "and", "many", "possible", "bigrams", "with", "a"]
- result = self.analyze_words(scores, s)
- self.assertEqual(result, [
- "simple", "the", "sentence", "and", "many_possible", "bigrams", "with", "a"])
+ sentence = ["simple", "the", "sentence", "and", "many", "possible", "bigrams", "with", "a"]
+ result = self.AnalysisTester(scores, threshold=1)[sentence]
+ self.assertEqual(
+ result,
+ ["simple", "the", "sentence", "and", "many_possible", "bigrams", "with", "a"],
+ )
def test_analysis_common_terms_in_between(self):
scores = {
- b"simple_sentence": 2, b"sentence_with_many": 2,
- b"many_possible": 2, b"many_of_the_possible": 2, b"possible_bigrams": 2}
- s = ["sentence", "with", "many", "possible", "bigrams"]
- result = self.analyze_words(scores, s)
+ "simple_sentence": 2, "sentence_with_many": 2,
+ "many_possible": 2, "many_of_the_possible": 2, "possible_bigrams": 2,
+ }
+ sentence = ["sentence", "with", "many", "possible", "bigrams"]
+ result = self.AnalysisTester(scores, threshold=1)[sentence]
self.assertEqual(result, ["sentence_with_many", "possible_bigrams"])
- s = ["a", "simple", "sentence", "with", "many", "of", "the", "possible", "bigrams", "with"]
- result = self.analyze_words(scores, s)
+ sentence = ["a", "simple", "sentence", "with", "many", "of", "the", "possible", "bigrams", "with"]
+ result = self.AnalysisTester(scores, threshold=1)[sentence]
self.assertEqual(
result, ["a", "simple_sentence", "with", "many_of_the_possible", "bigrams", "with"])
class PhrasesData:
+
sentences = common_texts + [
- ['graph', 'minors', 'survey', 'human', 'interface']
+ ['graph', 'minors', 'survey', 'human', 'interface'],
]
- unicode_sentences = [[to_unicode(w) for w in sentence] for sentence in sentences]
common_terms = frozenset()
bigram1 = u'response_time'
@@ -148,24 +105,18 @@ def gen_sentences(self):
return ((w for w in sentence) for sentence in self.sentences)
-class PhrasesCommon:
- """ Tests that need to be run for both Phrases and Phraser classes."""
+class PhrasesCommon(PhrasesData):
+ """Tests for both Phrases and FrozenPhrases classes."""
def setUp(self):
- self.bigram = Phrases(
- self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
- self.bigram_default = Phrases(
- self.sentences, common_terms=self.common_terms)
- self.bigram_utf8 = Phrases(
- self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
- self.bigram_unicode = Phrases(
- self.unicode_sentences, min_count=1, threshold=1, common_terms=self.common_terms)
+ self.bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
+ self.bigram_default = Phrases(self.sentences, common_terms=self.common_terms)
def testEmptyPhrasifiedSentencesIterator(self):
bigram_phrases = Phrases(self.sentences)
- bigram_phraser = Phraser(bigram_phrases)
+ bigram_phraser = FrozenPhrases(bigram_phrases)
trigram_phrases = Phrases(bigram_phraser[self.sentences])
- trigram_phraser = Phraser(trigram_phrases)
+ trigram_phraser = FrozenPhrases(trigram_phrases)
trigrams = trigram_phraser[bigram_phraser[self.sentences]]
fst, snd = list(trigrams), list(trigrams)
self.assertEqual(fst, snd)
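The iterator test above doubles as a recipe for trigrams: train a bigram model, freeze it, and feed the transformed corpus into a second ``Phrases``. A sketch (corpus and thresholds illustrative)::

    from gensim.models.phrases import Phrases, FrozenPhrases

    sentences = [['new', 'york', 'city', 'hall'], ['new', 'york', 'city']] * 3
    bigram = FrozenPhrases(Phrases(sentences, min_count=1, threshold=0.5))
    trigram = FrozenPhrases(Phrases(bigram[sentences], min_count=1, threshold=0.5))
    print(trigram[bigram[['new', 'york', 'city', 'hall']]])  # e.g. ['new_york_city_hall']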
@@ -187,22 +138,27 @@ def testEmptyInputsOnBigramConstruction(self):
def testSentenceGeneration(self):
"""Test basic bigram using a dummy corpus."""
# test that we generate the same amount of sentences as the input
- self.assertEqual(len(self.sentences), len(list(self.bigram_default[self.sentences])))
+ self.assertEqual(
+ len(self.sentences),
+ len(list(self.bigram_default[self.sentences])),
+ )
def testSentenceGenerationWithGenerator(self):
"""Test basic bigram production when corpus is a generator."""
- self.assertEqual(len(list(self.gen_sentences())),
- len(list(self.bigram_default[self.gen_sentences()])))
+ self.assertEqual(
+ len(list(self.gen_sentences())),
+ len(list(self.bigram_default[self.gen_sentences()])),
+ )
def testBigramConstruction(self):
- """Test Phrases bigram construction building."""
+ """Test Phrases bigram construction."""
# with this setting we should get response_time and graph_minors
bigram1_seen = False
bigram2_seen = False
- for s in self.bigram[self.sentences]:
- if not bigram1_seen and self.bigram1 in s:
+ for sentence in self.bigram[self.sentences]:
+ if not bigram1_seen and self.bigram1 in sentence:
bigram1_seen = True
- if not bigram2_seen and self.bigram2 in s:
+ if not bigram2_seen and self.bigram2 in sentence:
bigram2_seen = True
if bigram1_seen and bigram2_seen:
break
@@ -218,7 +174,7 @@ def testBigramConstruction(self):
self.assertTrue(self.bigram3 in self.bigram[self.sentences[-1]])
def testBigramConstructionFromGenerator(self):
- """Test Phrases bigram construction building when corpus is a generator"""
+ """Test Phrases bigram construction building when corpus is a generator."""
bigram1_seen = False
bigram2_seen = False
@@ -232,7 +188,7 @@ def testBigramConstructionFromGenerator(self):
self.assertTrue(bigram1_seen and bigram2_seen)
def testBigramConstructionFromArray(self):
- """Test Phrases bigram construction building when corpus is a numpy array"""
+ """Test Phrases bigram construction building when corpus is a numpy array."""
bigram1_seen = False
bigram2_seen = False
@@ -245,16 +201,6 @@ def testBigramConstructionFromArray(self):
break
self.assertTrue(bigram1_seen and bigram2_seen)
- def testEncoding(self):
- """Test that both utf8 and unicode input work; output must be unicode."""
- expected = [u'survey', u'user', u'computer', u'system', u'response_time']
-
- self.assertEqual(self.bigram_utf8[self.sentences[1]], expected)
- self.assertEqual(self.bigram_unicode[self.sentences[1]], expected)
-
- transformed = ' '.join(self.bigram_utf8[self.sentences[1]])
- self.assertTrue(isinstance(transformed, six.text_type))
-
# scorer for testCustomScorer
# the function is outside of the test class to keep the custom scorer picklable
@@ -264,43 +210,32 @@ def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co
return 1
-class TestPhrasesModel(PhrasesData, PhrasesCommon, unittest.TestCase):
+class TestPhrasesModel(PhrasesCommon, unittest.TestCase):
def testExportPhrases(self):
- """Test Phrases bigram export_phrases functionality."""
- bigram = Phrases(self.sentences, min_count=1, threshold=1)
-
- seen_bigrams = set()
-
- for phrase, score in bigram.export_phrases(self.sentences):
- seen_bigrams.add(phrase)
+ """Test Phrases bigram export phrases."""
+ bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
+ seen_bigrams = set(bigram.export_phrases(self.sentences).keys())
assert seen_bigrams == {
- b'response time',
- b'graph minors',
- b'human interface',
+ 'response time',
+ 'graph minors',
+ 'human interface',
}
def testMultipleBigramsSingleEntry(self):
- """ a single entry should produce multiple bigrams. """
- bigram = Phrases(self.sentences, min_count=1, threshold=1)
- seen_bigrams = set()
-
+ """Test a single entry produces multiple bigrams."""
+ bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
- for phrase, score in bigram.export_phrases(test_sentences):
- seen_bigrams.add(phrase)
+ seen_bigrams = set(bigram.export_phrases(test_sentences).keys())
- assert seen_bigrams == {b'graph minors', b'human interface'}
+ assert seen_bigrams == {'graph minors', 'human interface'}
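These tests also capture the behavioural change to ``export_phrases()``: it now returns a dict mapping each detected phrase (a ``str``, joined with the configured ``delimiter``) to its score, instead of yielding ``(bytes, float)`` tuples. A sketch of the new call (data illustrative)::

    from gensim.models.phrases import Phrases

    sentences = [['graph', 'minors', 'survey'], ['graph', 'minors', 'human']]
    bigram = Phrases(sentences, min_count=1, threshold=1, delimiter=' ')
    for phrase, score in bigram.export_phrases(sentences).items():  # dict now
        print(phrase, round(score, 3))  # e.g. "graph minors 1.75"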
def testScoringDefault(self):
- """ test the default scoring, from the mikolov word2vec paper """
- bigram = Phrases(self.sentences, min_count=1, threshold=1)
-
- seen_scores = set()
-
+ """Test the default scoring, from the mikolov word2vec paper."""
+ bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
- for phrase, score in bigram.export_phrases(test_sentences):
- seen_scores.add(round(score, 3))
+ seen_scores = set(round(score, 3) for score in bigram.export_phrases(test_sentences).values())
assert seen_scores == {
5.167, # score for graph minors
@@ -308,22 +243,18 @@ def testScoringDefault(self):
}
def test__getitem__(self):
- """ test Phrases[sentences] with a single sentence"""
+ """Test Phrases[sentences] with a single sentence."""
bigram = Phrases(self.sentences, min_count=1, threshold=1)
- # pdb.set_trace()
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
phrased_sentence = next(bigram[test_sentences].__iter__())
assert phrased_sentence == ['graph_minors', 'survey', 'human_interface']
def testScoringNpmi(self):
- """ test normalized pointwise mutual information scoring """
+ """Test normalized pointwise mutual information scoring."""
bigram = Phrases(self.sentences, min_count=1, threshold=.5, scoring='npmi')
-
- seen_scores = set()
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
- for phrase, score in bigram.export_phrases(test_sentences):
- seen_scores.add(round(score, 3))
+ seen_scores = set(round(score, 3) for score in bigram.export_phrases(test_sentences).values())
assert seen_scores == {
.882, # score for graph minors
@@ -331,16 +262,12 @@ def testScoringNpmi(self):
}
def testCustomScorer(self):
- """ test using a custom scoring function """
-
+ """Test using a custom scoring function."""
bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
-
- seen_scores = []
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
- for phrase, score in bigram.export_phrases(test_sentences):
- seen_scores.append(score)
+ seen_scores = list(bigram.export_phrases(test_sentences).values())
- assert all(seen_scores) # all scores 1
+ assert all(score == 1 for score in seen_scores)
assert len(seen_scores) == 3 # 'graph minors' and 'survey human' and 'interface system'
def testBadParameters(self):
@@ -361,31 +288,25 @@ def testPruning(self):
class TestPhrasesPersistence(PhrasesData, unittest.TestCase):
def testSaveLoadCustomScorer(self):
- """ saving and loading a Phrases object with a custom scorer """
-
+ """Test saving and loading a Phrases object with a custom scorer."""
with temporary_file("test.pkl") as fpath:
bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
bigram.save(fpath)
bigram_loaded = Phrases.load(fpath)
- seen_scores = []
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
- for phrase, score in bigram_loaded.export_phrases(test_sentences):
- seen_scores.append(score)
+ seen_scores = list(bigram_loaded.export_phrases(test_sentences).values())
- assert all(seen_scores) # all scores 1
+ assert all(score == 1 for score in seen_scores)
assert len(seen_scores) == 3 # 'graph minors' and 'survey human' and 'interface system'
def testSaveLoad(self):
- """ Saving and loading a Phrases object."""
-
+ """Test saving and loading a Phrases object."""
with temporary_file("test.pkl") as fpath:
bigram = Phrases(self.sentences, min_count=1, threshold=1)
bigram.save(fpath)
bigram_loaded = Phrases.load(fpath)
- seen_scores = set()
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
- for phrase, score in bigram_loaded.export_phrases(test_sentences):
- seen_scores.add(round(score, 3))
+ seen_scores = set(round(score, 3) for score in bigram_loaded.export_phrases(test_sentences).values())
assert seen_scores == set([
5.167, # score for graph minors
@@ -393,13 +314,10 @@ def testSaveLoad(self):
])
def testSaveLoadStringScoring(self):
- """ Saving and loading a Phrases object with a string scoring parameter.
- This should ensure backwards compatibility with the previous version of Phrases"""
+ """Test backwards compatibility with a previous version of Phrases with custom scoring."""
bigram_loaded = Phrases.load(datapath("phrases-scoring-str.pkl"))
- seen_scores = set()
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
- for phrase, score in bigram_loaded.export_phrases(test_sentences):
- seen_scores.add(round(score, 3))
+ seen_scores = set(round(score, 3) for score in bigram_loaded.export_phrases(test_sentences).values())
assert seen_scores == set([
5.167, # score for graph minors
@@ -407,14 +325,10 @@ def testSaveLoadStringScoring(self):
])
def testSaveLoadNoScoring(self):
- """ Saving and loading a Phrases object with no scoring parameter.
- This should ensure backwards compatibility with old versions of Phrases"""
-
+ """Test backwards compatibility with old versions of Phrases with no scoring parameter."""
bigram_loaded = Phrases.load(datapath("phrases-no-scoring.pkl"))
- seen_scores = set()
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
- for phrase, score in bigram_loaded.export_phrases(test_sentences):
- seen_scores.add(round(score, 3))
+ seen_scores = set(round(score, 3) for score in bigram_loaded.export_phrases(test_sentences).values())
assert seen_scores == set([
5.167, # score for graph minors
@@ -426,77 +340,67 @@ def testSaveLoadNoCommonTerms(self):
bigram_loaded = Phrases.load(datapath("phrases-no-common-terms.pkl"))
self.assertEqual(bigram_loaded.common_terms, frozenset())
# can make a phraser, cf #1751
- phraser = Phraser(bigram_loaded) # does not raise
+ phraser = FrozenPhrases(bigram_loaded) # does not raise
phraser[["human", "interface", "survey"]] # does not raise
-class TestPhraserPersistence(PhrasesData, unittest.TestCase):
+class TestFrozenPhrasesPersistence(PhrasesData, unittest.TestCase):
def testSaveLoadCustomScorer(self):
- """Saving and loading a Phraser object with a custom scorer """
+ """Test saving and loading a FrozenPhrases object with a custom scorer."""
with temporary_file("test.pkl") as fpath:
- bigram = Phraser(
+ bigram = FrozenPhrases(
Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer))
bigram.save(fpath)
- bigram_loaded = Phraser.load(fpath)
- # we do not much with scoring, just verify its the one expected
+ bigram_loaded = FrozenPhrases.load(fpath)
self.assertEqual(bigram_loaded.scoring, dumb_scorer)
def testSaveLoad(self):
- """ Saving and loading a Phraser object."""
+ """Test saving and loading a FrozenPhrases object."""
with temporary_file("test.pkl") as fpath:
- bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
+ bigram = FrozenPhrases(Phrases(self.sentences, min_count=1, threshold=1))
bigram.save(fpath)
- bigram_loaded = Phraser.load(fpath)
+ bigram_loaded = FrozenPhrases.load(fpath)
self.assertEqual(
bigram_loaded[['graph', 'minors', 'survey', 'human', 'interface', 'system']],
['graph_minors', 'survey', 'human_interface', 'system'])
def testSaveLoadStringScoring(self):
- """ Saving and loading a Phraser object with a string scoring parameter.
- This should ensure backwards compatibility with the previous version of Phraser"""
- bigram_loaded = Phraser.load(datapath("phraser-scoring-str.pkl"))
+ """Test saving and loading a FrozenPhrases object with a string scoring parameter.
+ This should ensure backwards compatibility with the previous version of FrozenPhrases"""
+ bigram_loaded = FrozenPhrases.load(datapath("phraser-scoring-str.pkl"))
# we don't do much with scoring here, just verify it's the one expected
self.assertEqual(bigram_loaded.scoring, original_scorer)
def testSaveLoadNoScoring(self):
- """ Saving and loading a Phraser object with no scoring parameter.
- This should ensure backwards compatibility with old versions of Phraser"""
- bigram_loaded = Phraser.load(datapath("phraser-no-scoring.pkl"))
+ """Test saving and loading a FrozenPhrases object with no scoring parameter.
+ This should ensure backwards compatibility with old versions of FrozenPhrases"""
+ bigram_loaded = FrozenPhrases.load(datapath("phraser-no-scoring.pkl"))
# we don't do much with scoring here, just verify it's the one expected
self.assertEqual(bigram_loaded.scoring, original_scorer)
def testSaveLoadNoCommonTerms(self):
- """ Ensure backwards compatibility with old versions of Phraser, before common_terms"""
- bigram_loaded = Phraser.load(datapath("phraser-no-common-terms.pkl"))
+ """Ensure backwards compatibility with old versions of FrozenPhrases, before common_terms."""
+ bigram_loaded = FrozenPhrases.load(datapath("phraser-no-common-terms.pkl"))
self.assertEqual(bigram_loaded.common_terms, frozenset())
-class TestPhraserModel(PhrasesData, PhrasesCommon, unittest.TestCase):
- """ Test Phraser models."""
+class TestFrozenPhrasesModel(PhrasesCommon, unittest.TestCase):
+ """Test FrozenPhrases models."""
def setUp(self):
- """Set up Phraser models for the tests."""
+ """Set up FrozenPhrases models for the tests."""
bigram_phrases = Phrases(
self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
- self.bigram = Phraser(bigram_phrases)
+ self.bigram = FrozenPhrases(bigram_phrases)
bigram_default_phrases = Phrases(self.sentences, common_terms=self.common_terms)
- self.bigram_default = Phraser(bigram_default_phrases)
-
- bigram_utf8_phrases = Phrases(
- self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
- self.bigram_utf8 = Phraser(bigram_utf8_phrases)
-
- bigram_unicode_phrases = Phrases(
- self.unicode_sentences, min_count=1, threshold=1, common_terms=self.common_terms)
- self.bigram_unicode = Phraser(bigram_unicode_phrases)
+ self.bigram_default = FrozenPhrases(bigram_default_phrases)
class CommonTermsPhrasesData:
- """This mixin permits to reuse the test, using, this time the common_terms option
- """
+ """This mixin permits to reuse tests with the common_terms option."""
sentences = [
['human', 'interface', 'with', 'computer'],
@@ -510,7 +414,6 @@ class CommonTermsPhrasesData:
['data', 'and', 'graph', 'survey'],
['data', 'and', 'graph', 'survey', 'for', 'human', 'interface'] # test bigrams within same sentence
]
- unicode_sentences = [[to_unicode(w) for w in sentence] for sentence in sentences]
common_terms = ['of', 'and', 'for']
bigram1 = u'lack_of_interest'
@@ -527,63 +430,43 @@ def gen_sentences(self):
class TestPhrasesModelCommonTerms(CommonTermsPhrasesData, TestPhrasesModel):
"""Test Phrases models with common terms"""
- def testEncoding(self):
- """Test that both utf8 and unicode input work; output must be unicode."""
- expected = [u'survey', u'of', u'user', u'computer', u'system', u'lack_of_interest']
-
- self.assertEqual(self.bigram_utf8[self.sentences[1]], expected)
- self.assertEqual(self.bigram_unicode[self.sentences[1]], expected)
-
- transformed = ' '.join(self.bigram_utf8[self.sentences[1]])
- self.assertTrue(isinstance(transformed, six.text_type))
-
def testMultipleBigramsSingleEntry(self):
- """ a single entry should produce multiple bigrams. """
- bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
-
- seen_bigrams = set()
+ """Test a single entry produces multiple bigrams."""
+ bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms, delimiter=' ')
test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
- for phrase, score in bigram.export_phrases(test_sentences):
- seen_bigrams.add(phrase)
+ seen_bigrams = set(bigram.export_phrases(test_sentences).keys())
+
assert seen_bigrams == set([
- b'data and graph',
- b'human interface',
+ 'data and graph',
+ 'human interface',
])
def testExportPhrases(self):
- """Test Phrases bigram export_phrases functionality."""
- bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
-
- seen_bigrams = set()
-
- for phrase, score in bigram.export_phrases(self.sentences):
- seen_bigrams.add(phrase)
+ """Test Phrases bigram export phrases."""
+ bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms, delimiter=' ')
+ seen_bigrams = set(bigram.export_phrases(self.sentences).keys())
assert seen_bigrams == set([
- b'human interface',
- b'graph of trees',
- b'data and graph',
- b'lack of interest',
+ 'human interface',
+ 'graph of trees',
+ 'data and graph',
+ 'lack of interest',
])
def testScoringDefault(self):
""" test the default scoring, from the mikolov word2vec paper """
bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
-
- seen_scores = set()
-
test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
- for phrase, score in bigram.export_phrases(test_sentences):
- seen_scores.add(round(score, 3))
+ seen_scores = set(round(score, 3) for score in bigram.export_phrases(test_sentences).values())
min_count = float(bigram.min_count)
len_vocab = float(len(bigram.vocab))
- graph = float(bigram.vocab[b"graph"])
- data = float(bigram.vocab[b"data"])
- data_and_graph = float(bigram.vocab[b"data_and_graph"])
- human = float(bigram.vocab[b"human"])
- interface = float(bigram.vocab[b"interface"])
- human_interface = float(bigram.vocab[b"human_interface"])
+ graph = float(bigram.vocab["graph"])
+ data = float(bigram.vocab["data"])
+ data_and_graph = float(bigram.vocab["data_and_graph"])
+ human = float(bigram.vocab["human"])
+ interface = float(bigram.vocab["interface"])
+ human_interface = float(bigram.vocab["human_interface"])
assert seen_scores == set([
# score for data and graph
@@ -593,15 +476,13 @@ def testScoringDefault(self):
])
def testScoringNpmi(self):
- """ test normalized pointwise mutual information scoring """
- bigram = Phrases(self.sentences, min_count=1, threshold=.5,
- scoring='npmi', common_terms=self.common_terms)
-
- seen_scores = set()
-
+ """Test normalized pointwise mutual information scoring."""
+ bigram = Phrases(
+ self.sentences, min_count=1, threshold=.5,
+ scoring='npmi', common_terms=self.common_terms,
+ )
test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
- for phrase, score in bigram.export_phrases(test_sentences):
- seen_scores.add(round(score, 3))
+ seen_scores = set(round(score, 3) for score in bigram.export_phrases(test_sentences).values())
assert seen_scores == set([
.74, # score for data and graph
@@ -609,56 +490,35 @@ def testScoringNpmi(self):
])
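NPMI normalises scores into the [-1, 1] range, which is why these tests use a much lower threshold than the default scorer would need. A small sketch (data illustrative)::

    from gensim.models.phrases import Phrases

    sentences = [['data', 'and', 'graph'], ['data', 'and', 'graph', 'survey']]
    bigram = Phrases(
        sentences, min_count=1, threshold=0.5,
        scoring='npmi', common_terms=frozenset(['and']),
    )
    print(bigram[['data', 'and', 'graph', 'survey']])  # e.g. ['data_and_graph', 'survey']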
def testCustomScorer(self):
- """ test using a custom scoring function """
-
- bigram = Phrases(self.sentences, min_count=1, threshold=.001,
- scoring=dumb_scorer, common_terms=self.common_terms)
-
- seen_scores = []
+ """Test using a custom scoring function."""
+ bigram = Phrases(
+ self.sentences, min_count=1, threshold=.001,
+ scoring=dumb_scorer, common_terms=self.common_terms,
+ )
test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
- for phrase, score in bigram.export_phrases(test_sentences):
- seen_scores.append(score)
+ seen_scores = list(bigram.export_phrases(test_sentences).values())
assert all(seen_scores) # all scores 1
assert len(seen_scores) == 2 # 'data and graph' 'survey for human'
def test__getitem__(self):
- """ test Phrases[sentences] with a single sentence"""
+ """Test Phrases[sentences] with a single sentence."""
bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
- # pdb.set_trace()
test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
phrased_sentence = next(bigram[test_sentences].__iter__())
assert phrased_sentence == ['data_and_graph', 'survey', 'for', 'human_interface']
-class TestPhraserModelCommonTerms(CommonTermsPhrasesData, TestPhraserModel):
-
- def testEncoding(self):
- """Test that both utf8 and unicode input work; output must be unicode."""
- expected = [u'survey', u'of', u'user', u'computer', u'system', u'lack_of_interest']
-
- self.assertEqual(self.bigram_utf8[self.sentences[1]], expected)
- self.assertEqual(self.bigram_unicode[self.sentences[1]], expected)
-
- transformed = ' '.join(self.bigram_utf8[self.sentences[1]])
- self.assertTrue(isinstance(transformed, six.text_type))
-
-
-class TestPhraserModelCompatibilty(unittest.TestCase):
+class TestFrozenPhrasesModelCompatibility(unittest.TestCase):
def testCompatibilty(self):
- phr = Phraser.load(datapath("phraser-3.6.0.model"))
- model = Phrases.load(datapath("phrases-3.6.0.model"))
-
+ phrases = Phrases.load(datapath("phrases-3.6.0.model"))
+ phraser = FrozenPhrases.load(datapath("phraser-3.6.0.model"))
test_sentences = ['trees', 'graph', 'minors']
- expected_res = ['trees', 'graph_minors']
-
- phr_out = phr[test_sentences]
- model_out = model[test_sentences]
- self.assertEqual(phr_out, expected_res)
- self.assertEqual(model_out, expected_res)
+ self.assertEqual(phrases[test_sentences], ['trees', 'graph_minors'])
+ self.assertEqual(phraser[test_sentences], ['trees', 'graph_minors'])
if __name__ == '__main__':
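The compatibility test relies on the renamed classes still unpickling models saved under the old names; a sketch of the load path, using the 3.6.0 fixtures referenced above::

    from gensim.models.phrases import Phrases, FrozenPhrases
    from gensim.test.utils import datapath

    # Models pickled by gensim 3.6.0 under the old class names still load.
    phrases = Phrases.load(datapath("phrases-3.6.0.model"))
    frozen = FrozenPhrases.load(datapath("phraser-3.6.0.model"))  # saved as Phraser
    sentence = ['trees', 'graph', 'minors']
    assert phrases[sentence] == frozen[sentence] == ['trees', 'graph_minors']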
diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py
index 9dc7d303eb..b6b9449eb4 100644
--- a/gensim/test/test_sklearn_api.py
+++ b/gensim/test/test_sklearn_api.py
@@ -1137,7 +1137,7 @@ def testPartialFit(self):
new_sentences = [
['world', 'peace', 'humans', 'world', 'peace', 'world', 'peace', 'people'],
['world', 'peace', 'people'],
- ['world', 'peace', 'humans']
+ ['world', 'peace', 'humans'],
]
self.model.partial_fit(X=new_sentences) # train model with new sentences
@@ -1182,30 +1182,6 @@ def setUp(self):
[u'the', u'bank_of_america', u'offices', u'are', u'closed']
]
- def testCompareToOld(self):
- with open(datapath("phrases-transformer-v3-5-0.pkl"), "rb") as old_phrases_transformer_pkl:
- old_phrases_transformer = pickle.load(old_phrases_transformer_pkl)
- doc = phrases_sentences[-1]
- phrase_tokens = old_phrases_transformer.transform(doc)[0]
- expected_phrase_tokens = [u'graph_minors', u'survey', u'human_interface']
- self.assertEqual(phrase_tokens, expected_phrase_tokens)
-
- self.model.fit(phrases_sentences)
- new_phrase_tokens = self.model.transform(doc)[0]
- self.assertEqual(new_phrase_tokens, phrase_tokens)
-
- def testLoadNew(self):
- with open(datapath("phrases-transformer-new-v3-5-0.pkl"), "rb") as new_phrases_transformer_pkl:
- old_phrases_transformer = pickle.load(new_phrases_transformer_pkl)
- doc = phrases_sentences[-1]
- phrase_tokens = old_phrases_transformer.transform(doc)[0]
- expected_phrase_tokens = [u'graph_minors', u'survey', u'human_interface']
- self.assertEqual(phrase_tokens, expected_phrase_tokens)
-
- self.model.fit(phrases_sentences)
- new_phrase_tokens = self.model.transform(doc)[0]
- self.assertEqual(new_phrase_tokens, phrase_tokens)
-
def testFitAndTransform(self):
self.model.fit(phrases_w_common_terms)
@@ -1247,10 +1223,7 @@ def testPartialFit(self):
self.assertEqual(transformed_2, expected_transformations_2)
-# specifically test pluggable scoring in Phrases, because possible pickling issues with function parameter
-
-# this is intentionally in main rather than a class method to support pickling
-# all scores will be 1
+# For testing pluggable scoring in Phrases; kept at module level so it stays picklable.
def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
return 1
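As the comment says, a custom scorer has to live at module level so that a ``Phrases`` model holding a reference to it stays picklable; a sketch of plugging one in (the scorer logic here is illustrative)::

    from gensim.models.phrases import Phrases

    def count_scorer(worda_count, wordb_count, bigram_count,
                     len_vocab, min_count, corpus_word_count):
        """Score a candidate purely by how often the bigram occurred."""
        return bigram_count

    sentences = [['graph', 'minors', 'survey']] * 5
    bigram = Phrases(sentences, min_count=1, threshold=2, scoring=count_scorer)
    bigram.save('bigram.pkl')  # works because count_scorer is importable
    print(Phrases.load('bigram.pkl')[['graph', 'minors', 'survey']])  # e.g. ['graph_minors', 'survey']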