diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index cbe06b6fb5..ca3c1ec019 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -13,7 +13,7 @@ If you're thinking about contributing documentation, please see :ref:`sphx_glr_a .. raw:: html -
+
@@ -33,9 +33,10 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png + :alt: Core Concepts - :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` + :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` .. raw:: html @@ -53,9 +54,10 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png + :alt: Corpora and Vector Spaces - :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` + :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` .. raw:: html @@ -73,9 +75,10 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png + :alt: Topics and Transformations - :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` + :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` .. raw:: html @@ -93,9 +96,10 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png + :alt: Similarity Queries - :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` + :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` .. raw:: html @@ -108,7 +112,7 @@ Understanding this functionality is vital for using gensim effectively. /auto_examples/core/run_similarity_queries .. raw:: html -
+
@@ -127,9 +131,10 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png + :alt: Word2Vec Model - :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` + :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` .. raw:: html @@ -147,9 +152,10 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png + :alt: Doc2Vec Model - :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` + :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` .. raw:: html @@ -167,9 +173,10 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png + :alt: FastText Model - :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` + :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` .. raw:: html @@ -187,9 +194,10 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png + :alt: Fast Similarity Queries with Annoy and Word2Vec - :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` + :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` .. raw:: html @@ -207,9 +215,10 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png + :alt: LDA Model - :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` .. raw:: html @@ -227,9 +236,10 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png + :alt: Word Mover's Distance - :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` + :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` .. raw:: html @@ -242,7 +252,7 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod /auto_examples/tutorials/run_wmd .. raw:: html -
+
@@ -261,9 +271,10 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png + :alt: How to download pre-trained models and corpora - :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` + :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` .. raw:: html @@ -281,9 +292,10 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png + :alt: How to Author Gensim Documentation - :ref:`sphx_glr_auto_examples_howtos_run_doc.py` + :ref:`sphx_glr_auto_examples_howtos_run_doc.py` .. raw:: html @@ -301,9 +313,10 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png + :alt: How to reproduce the doc2vec 'Paragraph Vector' paper - :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` + :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` .. raw:: html @@ -321,9 +334,10 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png + :alt: How to Compare LDA Models - :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` + :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` .. raw:: html @@ -336,7 +350,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u /auto_examples/howtos/run_compare_lda .. raw:: html -
+
@@ -379,7 +393,7 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from .. raw:: html -
+
@@ -389,15 +403,15 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from :class: sphx-glr-footer-gallery - .. container:: sphx-glr-download + .. container:: sphx-glr-download sphx-glr-download-python - :download:`Download all examples in Python source code: auto_examples_python.zip ` + :download:`Download all examples in Python source code: auto_examples_python.zip ` - .. container:: sphx-glr-download + .. container:: sphx-glr-download sphx-glr-download-jupyter - :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` + :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` .. only:: html diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 9460619db8..959604a4fc 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -11,7 +11,7 @@ * `Mikolov, et. al: "Distributed Representations of Words and Phrases and their Compositionality" `_ -* `"Normalized (Pointwise) Mutual Information in Colocation Extraction" by Gerlof Bouma +* `"Normalized (Pointwise) Mutual Information in Collocation Extraction" by Gerlof Bouma `_ @@ -21,39 +21,42 @@ >>> from gensim.test.utils import datapath >>> from gensim.models.word2vec import Text8Corpus - >>> from gensim.models.phrases import Phrases, Phraser + >>> from gensim.models.phrases import Phrases >>> - >>> # Load training data. + >>> # Create training corpus. Must be a sequence of sentences (e.g. an iterable or a generator). >>> sentences = Text8Corpus(datapath('testcorpus.txt')) - >>> # The training corpus must be a sequence (stream, generator) of sentences, - >>> # with each sentence a list of tokens: - >>> print(list(sentences)[0][:10]) + >>> # Each sentence must be a list of string tokens: + >>> first_sentence = next(iter(sentences)) + >>> print(first_sentence[:10]) ['computer', 'human', 'interface', 'computer', 'response', 'survey', 'system', 'time', 'user', 'interface'] >>> - >>> # Train a toy bigram model. - >>> phrases = Phrases(sentences, min_count=1, threshold=1) + >>> # Train a toy phrase model on our training corpus. + >>> phrase_model = Phrases(sentences, delimiter='_', min_count=1, threshold=1) + >>> >>> # Apply the trained phrases model to a new, unseen sentence. - >>> phrases[['trees', 'graph', 'minors']] + >>> new_sentence = ['trees', 'graph', 'minors'] + >>> phrase_model[new_sentence] ['trees_graph', 'minors'] >>> # The toy model considered "trees graph" a single phrase => joined the two - >>> # tokens into a single token, `trees_graph`. + >>> # tokens into a single "phrase" token, using our selected `_` delimiter. + >>> + >>> # Apply the trained model to each sentence of a corpus, using the same [] syntax: + >>> for sent in phrase_model[sentences]: + ... pass >>> >>> # Update the model with two new sentences on the fly. - >>> phrases.add_vocab([["hello", "world"], ["meow"]]) + >>> phrase_model.add_vocab([["hello", "world"], ["meow"]]) >>> >>> # Export the trained model = use less RAM, faster processing. Model updates no longer possible. - >>> bigram = Phraser(phrases) - >>> bigram[['trees', 'graph', 'minors']] # apply the exported model to a sentence + >>> frozen_model = phrase_model.freeze() + >>> # Apply the frozen model; same results as before: + >>> frozen_model[new_sentence] ['trees_graph', 'minors'] >>> - >>> # Apply the exported model to each sentence of a corpus: - >>> for sent in bigram[sentences]: - ... pass - >>> - >>> # Save / load an exported collocation model. 
-    >>> bigram.save("/tmp/my_bigram_model.pkl")
-    >>> bigram_reloaded = Phraser.load("/tmp/my_bigram_model.pkl")
-    >>> bigram_reloaded[['trees', 'graph', 'minors']]  # apply the exported model to a sentence
+    >>> # Save / load models.
+    >>> frozen_model.save("/tmp/my_phrase_model.pkl")
+    >>> model_reloaded = Phrases.load("/tmp/my_phrase_model.pkl")
+    >>> model_reloaded[['trees', 'graph', 'minors']]  # apply the reloaded model to a sentence
     ['trees_graph', 'minors']
 
 """
@@ -62,7 +65,6 @@
 import os
 import logging
 from collections import defaultdict
-import functools
 import itertools
 from math import log
 import pickle
@@ -73,6 +75,89 @@
 logger = logging.getLogger(__name__)
 
+NEGATIVE_INFINITY = float('-inf')
+
+
+def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
+    r"""Bigram scoring function, based on the original `Mikolov et al.: "Distributed Representations
+    of Words and Phrases and their Compositionality" `_.
+
+    Parameters
+    ----------
+    worda_count : int
+        Number of occurrences for first word.
+    wordb_count : int
+        Number of occurrences for second word.
+    bigram_count : int
+        Number of co-occurrences for phrase "worda_wordb".
+    len_vocab : int
+        Size of vocabulary.
+    min_count: int
+        Minimum collocation count threshold.
+    corpus_word_count : int
+        Not used in this particular scoring technique.
+
+    Returns
+    -------
+    float
+        Score for given bi-gram, greater than or equal to 0.
+
+    Notes
+    -----
+    Formula: :math:`\frac{(bigram\_count - min\_count) * len\_vocab}{worda\_count * wordb\_count}`.
+
+    """
+    denom = worda_count * wordb_count
+    if denom == 0:
+        return NEGATIVE_INFINITY
+    return (bigram_count - min_count) / float(denom) * len_vocab
+
+
+def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
+    r"""Calculate NPMI score, based on `"Normalized (Pointwise) Mutual Information in Collocation Extraction"
+    by Gerlof Bouma `_.
+
+    Parameters
+    ----------
+    worda_count : int
+        Number of occurrences for first word.
+    wordb_count : int
+        Number of occurrences for second word.
+    bigram_count : int
+        Number of co-occurrences for phrase "worda_wordb".
+    len_vocab : int
+        Not used.
+    min_count: int
+        Ignore all bigrams with total collected count lower than this value.
+    corpus_word_count : int
+        Total number of words in the corpus.
+
+    Returns
+    -------
+    float
+        If bigram_count >= min_count, return the collocation score, in the range -1 to 1.
+        Otherwise return -inf.
+
+    Notes
+    -----
+    Formula: :math:`\frac{\ln(prob(word_a, word_b) / (prob(word_a) * prob(word_b)))}{-\ln(prob(word_a, word_b))}`,
+    where :math:`prob(word) = \frac{word\_count}{corpus\_word\_count}`.
+
+    """
+    if bigram_count >= min_count:
+        corpus_word_count = float(corpus_word_count)
+        pa = worda_count / corpus_word_count
+        pb = wordb_count / corpus_word_count
+        pab = bigram_count / corpus_word_count
+        try:
+            return log(pab / (pa * pb)) / -log(pab)
+        except ValueError:  # some of the counts were zero => never a phrase
+            return NEGATIVE_INFINITY
+    else:
+        # Return -infinity to make sure that no phrases will be created
+        # from bigrams less frequent than min_count.
+        return NEGATIVE_INFINITY
 
 
 def _is_single(obj):
     """Check whether `obj` is a single document or an entire corpus.
 
     Parameters
     ----------
     obj : object
 
@@ -84,7 +169,8 @@ def _is_single(obj):
     Return
     ------
     (bool, object)
-        (is_single, new) tuple, where `new` yields the same sequence as `obj`.
+        2-tuple ``(is_single_document, new_obj)``, where `new_obj`
+        yields the same sequence as the original `obj`.
Notes ----- @@ -97,217 +183,238 @@ def _is_single(obj): peek = next(obj_iter) obj_iter = itertools.chain([peek], obj_iter) except StopIteration: - # An empty object is a single document + # An empty object is interpreted as a single document (not a corpus). return True, obj if isinstance(peek, str): - # It's a document, return the iterator + # First item is a string => obj is a single document for sure. return True, obj_iter if temp_iter is obj: - # Checking for iterator to the object + # An iterator / generator => interpret input as a corpus. return False, obj_iter - else: - # If the first item isn't a string, assume obj is a corpus - return False, obj + # If the first item isn't a string, assume obj is an iterable corpus. + return False, obj -class SentenceAnalyzer: - """Base util class for :class:`~gensim.models.phrases.Phrases` and :class:`~gensim.models.phrases.Phraser`.""" - def score_item(self, worda, wordb, components, scorer): - """Get bi-gram score statistics. +class _PhrasesTransformation(interfaces.TransformationABC): + """ + Abstract base class for :class:`~gensim.models.phrases.Phrases` and + :class:`~gensim.models.phrases.FrozenPhrases`. - Parameters - ---------- - worda : str - First word of bi-gram. - wordb : str - Second word of bi-gram. - components : generator - Contain all phrases. - scorer : function - Scorer function, as given to :class:`~gensim.models.phrases.Phrases`. - See :func:`~gensim.models.phrases.npmi_scorer` and :func:`~gensim.models.phrases.original_scorer`. + """ + def __init__(self, common_terms): + self.common_terms = frozenset(common_terms) + + def score_candidate(self, word_a, word_b, in_between): + """Score a single phrase candidate. Returns ------- - float - Score for given bi-gram. If bi-gram not present in dictionary - return -1. - + (str, float) + 2-tuple of ``(delimiter-joined phrase, phrase score)`` for a phrase, + or ``(None, None)`` if not a phrase. """ - vocab = self.vocab - if worda in vocab and wordb in vocab: - bigram = self.delimiter.join(components) - if bigram in vocab: - return scorer( - worda_count=float(vocab[worda]), - wordb_count=float(vocab[wordb]), - bigram_count=float(vocab[bigram])) - return -1 - - def analyze_sentence(self, sentence, threshold, common_terms, scorer): - """Analyze a sentence, detecting any bigrams that should be concatenated. + raise NotImplementedError("ABC: override this method in child classes") + + def analyze_sentence(self, sentence): + """Analyze a sentence, concatenating any detected phrases into a single token. Parameters ---------- sentence : iterable of str Token sequence representing the sentence to be analyzed. - threshold : float - The minimum score for a bigram to be taken into account. - common_terms : list of object - List of common terms, they receive special treatment. - scorer : function - Scorer function, as given to :class:`~gensim.models.phrases.Phrases`. - See :func:`~gensim.models.phrases.npmi_scorer` and :func:`~gensim.models.phrases.original_scorer`. Yields ------ (str, score) - If bi-gram detected, a tuple where the first element is a detect bigram, second its score. - Otherwise, the first tuple element is a single word and second is None. + Iterate through the input sentence tokens and yield 2-tuples of: + - ``(concatenated_phrase_tokens, score)`` for token sequences that form a phrase. + - ``(word, None)`` if the token is not a part of a phrase. 
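+
+        Example
+        -------
+        A minimal sketch of the yielded pairs, on the toy corpus used throughout this
+        module's docstrings (the exact scores depend on the trained model and threshold):
+
+        .. sourcecode:: pycon
+
+            >>> from gensim.test.utils import datapath
+            >>> from gensim.models.word2vec import Text8Corpus
+            >>> from gensim.models.phrases import Phrases
+            >>>
+            >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
+            >>> phrase_model = Phrases(sentences, min_count=1, threshold=1)
+            >>> for token, score in phrase_model.analyze_sentence(['trees', 'graph', 'minors']):
+            ...     print(token, score is not None)
+            trees_graph True
+            minors False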
""" - s = [utils.any2utf8(w) for w in sentence] - # adding None is a trick that helps getting an automatic happy ending - # as it won't be a common_word, nor score - s.append(None) - last_uncommon = None - in_between = [] - for word in s: - is_common = word in common_terms - if not is_common and last_uncommon: - chain = [last_uncommon] + in_between + [word] - # test between last_uncommon - score = self.score_item( - worda=last_uncommon, - wordb=word, - components=chain, - scorer=scorer, - ) - if score > threshold: - yield (chain, score) - last_uncommon = None - in_between = [] + start_token, in_between = None, [] + for word in sentence: + if word not in self.common_terms: + # The current word is a normal token, not a stop word, which means it's a potential + # beginning (or end) of a phrase. + if start_token: + # We're inside a potential phrase, of which this word is the end. + phrase, score = self.score_candidate(start_token, word, in_between) + if score is not None: + # Phrase detected! + yield phrase, score + start_token, in_between = None, [] + else: + # Not a phrase after all. Dissolve the candidate's constituent tokens as individual words. + yield start_token, None + for w in in_between: + yield w, None + start_token, in_between = word, [] # new potential phrase starts here else: - # release words individually - for w in itertools.chain([last_uncommon], in_between): - yield (w, None) - in_between = [] - last_uncommon = word - elif not is_common: - last_uncommon = word - else: # common term - if last_uncommon: - # wait for uncommon resolution + # Not inside a potential bigram yet; start a new potential bigram here. + start_token, in_between = word, [] + else: # We're a stop word. + if start_token: + # We're inside a potential bigram: add the stopword and keep growing the phrase. in_between.append(word) else: - yield (word, None) + # Not inside a bigram: emit the stopword and move on. Phrases never begin with a stopword. + yield word, None + # Emit any non-phrase tokens at the end. + if start_token: + yield start_token, None + for w in in_between: + yield w, None + def __getitem__(self, sentence): + """Convert the input sequence of tokens `sentence` into a sequence of tokens where adjacent + tokens are replaced by a single token if they form a bigram collocation. -class PhrasesTransformation(interfaces.TransformationABC): - """Base util class for :class:`~gensim.models.phrases.Phrases` and :class:`~gensim.models.phrases.Phraser`.""" + If `sentence` is an entire corpus (iterable of sentences rather than a single + sentence), return an iterable that converts each of the corpus' sentences + into phrases on the fly, one after another. + + Parameters + ---------- + sentence : {list of str, iterable of list of str} + Input sentence or a stream of sentences. + + Return + ------ + {list of str, iterable of list of str} + Sentence with phrase tokens joined by `self.delimiter` character, if input was a single sentence. + A generator of such joined sentences if input was a corpus. + + """ + is_single, sentence = _is_single(sentence) + if not is_single: + # If the input is an entire corpus (rather than a single sentence), + # return an iterable stream. + return self._apply(sentence) + + return [token for token, _ in self.analyze_sentence(sentence)] + + def export_phrases(self, sentences): + """Get all unique phrases (multi-word expressions) that appear in ``sentences``, and their scores. + + Parameters + ---------- + sentences : iterable of list of str + Text corpus. 
+
+        Returns
+        -------
+        dict(str, float)
+            Unique phrases mapped to their scores.
+
+        Example
+        -------
+        .. sourcecode:: pycon
+
+            >>> from gensim.test.utils import datapath
+            >>> from gensim.models.word2vec import Text8Corpus
+            >>> from gensim.models.phrases import Phrases
+            >>>
+            >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
+            >>> phrases = Phrases(sentences, min_count=1, threshold=0.1)
+            >>>
+            >>> for phrase, score in phrases.export_phrases(sentences).items():
+            ...     print(phrase, score)
+
+        """
+        result = {}
+        for sentence in sentences:
+            for phrase, score in self.analyze_sentence(sentence):
+                if score is not None:
+                    result[phrase] = score
+        return result
 
     @classmethod
     def load(cls, *args, **kwargs):
         """Load a previously saved :class:`~gensim.models.phrases.Phrases` /
-        :class:`~gensim.models.phrases.Phraser` class. Handles backwards compatibility from older
-        :class:`~gensim.models.phrases.Phrases` / :class:`~gensim.models.phrases.Phraser`
-        versions which did not support pluggable scoring functions.
+        :class:`~gensim.models.phrases.FrozenPhrases` model.
+
+        Handles backwards compatibility from older versions which did not support pluggable scoring functions.
 
         Parameters
         ----------
         args : object
-            Sequence of arguments, see :class:`~gensim.utils.SaveLoad.load` for more information.
+            See :class:`~gensim.utils.SaveLoad.load`.
         kwargs : object
-            Sequence of arguments, see :class:`~gensim.utils.SaveLoad.load` for more information.
+            See :class:`~gensim.utils.SaveLoad.load`.
 
         """
-        model = super(PhrasesTransformation, cls).load(*args, **kwargs)
-        # update older models
-        # if value in phrasegrams dict is a tuple, load only the scores.
+        model = super(_PhrasesTransformation, cls).load(*args, **kwargs)
 
-        for component, score in getattr(model, "phrasegrams", {}).items():
+        # Upgrade FrozenPhrases
+        try:
+            phrasegrams = getattr(model, "phrasegrams", {})
+            component, score = next(iter(phrasegrams.items()))
             if isinstance(score, tuple):
-                frequency, score_val = score
-                model.phrasegrams[component] = score_val
-
-        # if no scoring parameter, use default scoring
+                # Value in phrasegrams used to be a tuple; keep only the 2nd tuple component = score.
+                model.phrasegrams = {
+                    str(model.delimiter.join(key), encoding='utf8'): val[1]
+                    for key, val in phrasegrams.items()
+                }
+            elif isinstance(component, tuple):  # 3.8 => 4.0: phrasegram keys are strings, not tuples with bytestrings
+                model.phrasegrams = {
+                    str(model.delimiter.join(component), encoding='utf8'): score
+                    for component, score in phrasegrams.items()
+                }
+        except StopIteration:
+            # no phrasegrams, nothing to upgrade
+            pass
+
+        # If no scoring parameter, use default scoring.
         if not hasattr(model, 'scoring'):
-            logger.info('older version of %s loaded without scoring function', cls.__name__)
-            logger.info('setting pluggable scoring method to original_scorer for compatibility')
+            logger.warning('older version of %s loaded without scoring function', cls.__name__)
+            logger.warning('setting pluggable scoring method to original_scorer for compatibility')
             model.scoring = original_scorer
-        # if there is a scoring parameter, and it's a text value, load the proper scoring function
+        # If there is a scoring parameter, and it's a text value, load the proper scoring function.
if hasattr(model, 'scoring'): if isinstance(model.scoring, str): if model.scoring == 'default': - logger.info('older version of %s loaded with "default" scoring parameter', cls.__name__) - logger.info('setting scoring method to original_scorer pluggable scoring method for compatibility') + logger.warning('older version of %s loaded with "default" scoring parameter', cls.__name__) + logger.warning('setting scoring method to original_scorer for compatibility') model.scoring = original_scorer elif model.scoring == 'npmi': - logger.info('older version of %s loaded with "npmi" scoring parameter', cls.__name__) - logger.info('setting scoring method to npmi_scorer pluggable scoring method for compatibility') + logger.warning('older version of %s loaded with "npmi" scoring parameter', cls.__name__) + logger.warning('setting scoring method to npmi_scorer for compatibility') model.scoring = npmi_scorer else: - raise ValueError( - 'failed to load %s model with unknown scoring setting %s' % (cls.__name__, model.scoring)) - # if there is no common_terms attribute, initialize + raise ValueError(f'failed to load {cls.__name__} model, unknown scoring "{model.scoring}"') + # Initialize new attributes to default values. if not hasattr(model, "common_terms"): - logger.info('older version of %s loaded without common_terms attribute', cls.__name__) - logger.info('setting common_terms to empty set') + logger.warning( + 'older version of %s loaded without common_terms attribute, setting it to empty set', + cls.__name__, + ) model.common_terms = frozenset() - return model - - -def _sentence2token(phrase_class, sentence): - """ Convert the input tokens `sentence` into tokens where detected bigrams are joined by a selected delimiter. - This function is used by: meth:`~gensim.models.phrases.Phrases.__getitem__` and - meth:`~gensim.models.phrases.Phraser.__getitem__` - - Parameters - ---------- - phrase_class : - class:`~gensim.models.phrases.Phrases` or :class:`~gensim.models.phrases.Phraser` - sentence : {list of str, iterable of list of str} - Sentence or text corpus. - - Returns - ------- - {list of str, :class:`~gensim.interfaces.TransformedCorpus`} - `sentence` with detected phrase bigrams merged together, or a streamed corpus of such sentences - if the input was a corpus. - - """ - is_single, sentence = _is_single(sentence) - if not is_single: - # if the input is an entire corpus (rather than a single sentence), - # return an iterable stream. - return phrase_class._apply(sentence) - - delimiter = phrase_class.delimiter - if hasattr(phrase_class, 'vocab'): - scorer = functools.partial( - phrase_class.scoring, - len_vocab=float(len(phrase_class.vocab)), - min_count=float(phrase_class.min_count), - corpus_word_count=float(phrase_class.corpus_word_count)) - else: - scorer = None - bigrams = phrase_class.analyze_sentence(sentence, threshold=phrase_class.threshold, - common_terms=phrase_class.common_terms, scorer=scorer) + if not hasattr(model, 'corpus_word_count'): + logger.warning('older version of %s loaded without corpus_word_count', cls.__name__) + logger.warning('setting corpus_word_count to 0, do not use it in your scoring function') + model.corpus_word_count = 0 - new_s = [] - for words, score in bigrams: - if score is not None: - words = delimiter.join(words) - new_s.append(words) - return [utils.to_unicode(w) for w in new_s] + # Before 4.0.0, we stored strings as UTF8 bytes internally, to save RAM. Since 4.0.0, we use strings. 
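+        # For example, a 3.x vocab entry like {b'response_time': 5} becomes
+        # {'response_time': 5} after the upgrade below (illustrative key and count).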
+        if getattr(model, 'vocab', None):
+            word = next(iter(model.vocab))  # get a random key – any key will do
+            if not isinstance(word, str):
+                logger.info("old version of %s loaded, upgrading %i words in memory", cls.__name__, len(model.vocab))
+                logger.info("re-save the loaded model to avoid this upgrade in the future")
+                vocab = defaultdict(int)
+                for key, value in model.vocab.items():  # needs lots of extra RAM temporarily!
+                    vocab[str(key, encoding='utf8')] = value
+                model.vocab = vocab
+        if not isinstance(model.delimiter, str):
+            model.delimiter = str(model.delimiter, encoding='utf8')
+        return model
 
 
-class Phrases(SentenceAnalyzer, PhrasesTransformation):
+class Phrases(_PhrasesTransformation):
     """Detect phrases based on collocation counts."""
 
     def __init__(
             self, sentences=None, min_count=5, threshold=10.0,
-            max_vocab_size=40000000, delimiter=b'_', progress_per=10000,
+            max_vocab_size=40000000, delimiter='_', progress_per=10000,
             scoring='default', common_terms=frozenset(),
     ):
         """
@@ -330,7 +437,7 @@ def __init__(
            to keep memory under control. The default of 40M needs about 3.6GB of RAM. Increase/decrease
            `max_vocab_size` depending on how much available memory you have.
         delimiter : str, optional
-            Glue character used to join collocation tokens, should be a byte string (e.g. b'_').
+            Glue character used to join collocation tokens.
         scoring : {'default', 'npmi', function}, optional
            Specify how potential phrases are scored. `scoring` can be set with either a string that refers to a
            built-in scoring function, or with a function with the expected parameter names.
           Two built-in scoring functions are available by setting `scoring` to a string:
@@ -359,9 +466,40 @@ def __init__(
            * corpus_word_count - the total number of tokens (non-unique) in `sentences`
 
            The scoring function **must accept all these parameters**, even if it doesn't use them in its scoring.
+           The scoring function **must be pickleable**.
 
+        Examples
+        ----------
+        .. sourcecode:: pycon
+
+            >>> from gensim.test.utils import datapath
+            >>> from gensim.models.word2vec import Text8Corpus
+            >>> from gensim.models.phrases import Phrases
+            >>>
+            >>> # Load corpus and train a model.
+            >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
+            >>> phrases = Phrases(sentences, min_count=1, threshold=1)
+            >>>
+            >>> # Use the model to detect phrases in a new sentence.
+            >>> sent = [u'trees', u'graph', u'minors']
+            >>> print(phrases[sent])
+            [u'trees_graph', u'minors']
+            >>>
+            >>> # Or transform multiple sentences at once.
+            >>> sents = [[u'trees', u'graph', u'minors'], [u'graph', u'minors']]
+            >>> for phrase in phrases[sents]:
+            ...     print(phrase)
+            [u'trees_graph', u'minors']
+            [u'graph_minors']
+            >>>
+            >>> # Export a FrozenPhrases object that is more efficient but doesn't allow any more training.
+            >>> frozen_phrases = phrases.freeze()
+            >>> print(frozen_phrases[sent])
+            [u'trees_graph', u'minors']
 
         """
+        super().__init__(common_terms=common_terms)
         if min_count <= 0:
             raise ValueError("min_count should be at least 1")
 
@@ -370,10 +508,9 @@ def __init__(
         if scoring == 'npmi' and (threshold < -1 or threshold > 1):
             raise ValueError("threshold should be between -1 and 1 for npmi scoring")
 
-        # set scoring based on string
-        # intentially override the value of the scoring parameter rather than set self.scoring here,
-        # to still run the check of scoring function parameters in the next code block
-
+        # Set scoring based on string.
+        # Intentionally override the value of the scoring parameter rather than set self.scoring here,
+        # to still run the check of scoring function parameters in the next code block.
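+        # For reference, a user-supplied scorer might look like the minimal sketch below
+        # (`my_scorer` is a hypothetical example, shown only to illustrate the required
+        # signature; it must accept all six parameters and be pickleable):
+        #
+        #     def my_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
+        #         return bigram_count / (worda_count + wordb_count)  # toy score, not a real collocation measure
+        #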
if isinstance(scoring, str): if scoring == 'default': scoring = original_scorer @@ -382,65 +519,45 @@ def __init__( else: raise ValueError(f'unknown scoring method string {scoring} specified') - scoring_parameters = [ + scoring_params = [ 'worda_count', 'wordb_count', 'bigram_count', 'len_vocab', 'min_count', 'corpus_word_count', ] if callable(scoring): - if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters): + missing = [param for param in scoring_params if param not in getargspec(scoring)[0]] + if not missing: self.scoring = scoring else: - raise ValueError('scoring function missing expected parameters') + raise ValueError(f'scoring function missing expected parameters {missing}') self.min_count = min_count self.threshold = threshold self.max_vocab_size = max_vocab_size - self.vocab = defaultdict(int) # mapping between utf8 token => its count + self.vocab = defaultdict(int) # mapping between token => its count self.min_reduce = 1 # ignore any tokens with count smaller than this self.delimiter = delimiter self.progress_per = progress_per self.corpus_word_count = 0 - self.common_terms = frozenset(utils.any2utf8(w) for w in common_terms) - # ensure picklability of custom scorer + # Ensure picklability of the scorer. try: pickle.loads(pickle.dumps(self.scoring)) except pickle.PickleError: - raise pickle.PickleError('Custom Phrases scoring function must be pickle-able') + raise pickle.PickleError(f'Custom scoring function in {self.__class__.__name__} must be pickle-able') if sentences is not None: self.add_vocab(sentences) - @classmethod - def load(cls, *args, **kwargs): - """Load a previously saved Phrases class. - Handles backwards compatibility from older Phrases versions which did not support pluggable scoring functions. - - Parameters - ---------- - args : object - Sequence of arguments, see :class:`~gensim.utils.SaveLoad.load` for more information. - kwargs : object - Sequence of arguments, see :class:`~gensim.utils.SaveLoad.load` for more information. - - """ - model = super(Phrases, cls).load(*args, **kwargs) - if not hasattr(model, 'corpus_word_count'): - logger.info('older version of %s loaded without corpus_word_count', cls.__name__) - logger.info('Setting it to 0, do not use it in your scoring function.') - model.corpus_word_count = 0 - return model - def __str__(self): - """Get short string representation of this phrase detector.""" return "%s<%i vocab, min_count=%s, threshold=%s, max_vocab_size=%s>" % ( self.__class__.__name__, len(self.vocab), self.min_count, self.threshold, self.max_vocab_size, ) @staticmethod - def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000, - common_terms=frozenset()): - """Collect unigram/bigram counts from the `sentences` iterable. + def _learn_vocab( + sentences, max_vocab_size, delimiter='_', common_terms=frozenset(), progress_per=10000, + ): + """Collect unigram and bigram counts from the `sentences` iterable. Parameters ---------- @@ -451,62 +568,40 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000, for such examples. max_vocab_size : int Maximum size (number of tokens) of the vocabulary. Used to control pruning of less common words, - to keep memory under control. The default of 40M needs about 3.6GB of RAM. Increase/decrease + to keep memory under control. 40M needs about 3.6GB of RAM. Increase/decrease `max_vocab_size` depending on how much available memory you have. 
delimiter : str, optional - Glue character used to join collocation tokens, should be a byte string (e.g. b'_'). - progress_per : int - Write logs every `progress_per` sentence. + Glue character used to join collocation tokens. common_terms : set of str, optional - List of "stop words" that won't affect frequency count of expressions containing them. - Allow to detect expressions like "bank_of_america" or "eye_of_the_beholder". + List of "stop words" that won't affect frequency count of phrases containing them. + Allow to detect phrases like "bank_of_america" or "eye_of_the_beholder". + progress_per : int + Log progress once every `progress_per` sentences. Return ------ (int, dict of (str, int), int) - Number of pruned words, counters for each word/bi-gram and total number of words. - - Example - ---------- - .. sourcecode:: pycon - - >>> from gensim.test.utils import datapath - >>> from gensim.models.word2vec import Text8Corpus - >>> from gensim.models.phrases import Phrases - >>> - >>> sentences = Text8Corpus(datapath('testcorpus.txt')) - >>> pruned_words, counters, total_words = Phrases.learn_vocab(sentences, 100) - >>> (pruned_words, total_words) - (1, 29) - >>> counters['computer'] - 2 - >>> counters['response_time'] - 1 + Number of pruned words, counters for each word/bi-gram, and total number of words. """ - sentence_no = -1 - total_words = 0 - logger.info("collecting all words and their counts") + sentence_no, total_words, min_reduce = -1, 0, 1 vocab = defaultdict(int) - min_reduce = 1 + logger.info("collecting all words and their counts") for sentence_no, sentence in enumerate(sentences): if sentence_no % progress_per == 0: logger.info( "PROGRESS: at sentence #%i, processed %i words and %i word types", sentence_no, total_words, len(vocab), ) - s = [utils.any2utf8(w) for w in sentence] - last_uncommon = None - in_between = [] - for word in s: + start_token, in_between = None, [] + for word in sentence: if word not in common_terms: vocab[word] += 1 - if last_uncommon is not None: - components = itertools.chain([last_uncommon], in_between, [word]) - vocab[delimiter.join(components)] += 1 - last_uncommon = word - in_between = [] - elif last_uncommon is not None: + if start_token is not None: + phrase_tokens = itertools.chain([start_token], in_between, [word]) + vocab[delimiter.join(phrase_tokens)] += 1 + start_token, in_between = word, [] # treat word as both end of a phrase AND beginning of another + elif start_token is not None: in_between.append(word) total_words += 1 @@ -515,13 +610,13 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000, min_reduce += 1 logger.info( - "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences", - len(vocab), total_words, sentence_no + 1 + "collected %i token types (unigram + bigrams) from a corpus of %i words and %i sentences", + len(vocab), total_words, sentence_no + 1, ) return min_reduce, vocab, total_words def add_vocab(self, sentences): - """Update model with new `sentences`. + """Update model parameters with new `sentences`. Parameters ---------- @@ -535,7 +630,8 @@ def add_vocab(self, sentences): >>> from gensim.test.utils import datapath >>> from gensim.models.word2vec import Text8Corpus >>> from gensim.models.phrases import Phrases - >>> # Create corpus and use it for phrase detector + >>> + >>> # Train a phrase detector from a text corpus. 
>>> sentences = Text8Corpus(datapath('testcorpus.txt')) >>> phrases = Phrases(sentences) # train model >>> assert len(phrases.vocab) == 37 @@ -549,16 +645,18 @@ def add_vocab(self, sentences): >>> assert len(phrases.vocab) == 60 """ - # uses a separate vocab to collect the token counts from `sentences`. - # this consumes more RAM than merging new sentences into `self.vocab` + # Uses a separate vocab to collect the token counts from `sentences`. + # This consumes more RAM than merging new sentences into `self.vocab` # directly, but gives the new sentences a fighting chance to collect # sufficient counts, before being pruned out by the (large) accumulated # counts collected in previous learn_vocab runs. - min_reduce, vocab, total_words = self.learn_vocab( - sentences, self.max_vocab_size, self.delimiter, self.progress_per, self.common_terms) + min_reduce, vocab, total_words = self._learn_vocab( + sentences, max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, + progress_per=self.progress_per, common_terms=self.common_terms, + ) self.corpus_word_count += total_words - if len(self.vocab) > 0: + if self.vocab: logger.info("merging %i counts into %s", len(vocab), self) self.min_reduce = max(self.min_reduce, min_reduce) for word, count in vocab.items(): @@ -566,225 +664,59 @@ def add_vocab(self, sentences): if len(self.vocab) > self.max_vocab_size: utils.prune_vocab(self.vocab, self.min_reduce) self.min_reduce += 1 - logger.info("merged %s", self) else: - # in common case, avoid doubling gigantic dict - logger.info("using %i counts as vocab in %s", len(vocab), self) + # Optimization for a common case: the current vocab is empty, so apply + # the new vocab directly, no need to double it in memory. self.vocab = vocab - - def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): - """Get all phrases that appear in 'sentences' that pass the bigram threshold. - - Parameters - ---------- - sentences : iterable of list of str - Text corpus. - out_delimiter : str, optional - Delimiter used to "glue" together words that form a bigram phrase. - as_tuples : bool, optional - Yield `(tuple(words), score)` instead of `(out_delimiter.join(words), score)`? - - Yields - ------ - ((str, str), float) **or** (str, float) - Phrases detected in `sentences`. Return type depends on the `as_tuples` parameter. - - Example - ------- - .. sourcecode:: pycon - - >>> from gensim.test.utils import datapath - >>> from gensim.models.word2vec import Text8Corpus - >>> from gensim.models.phrases import Phrases - >>> - >>> sentences = Text8Corpus(datapath('testcorpus.txt')) - >>> phrases = Phrases(sentences, min_count=1, threshold=0.1) - >>> - >>> for phrase, score in phrases.export_phrases(sentences): - ... pass - - """ - analyze_sentence = functools.partial( - self.analyze_sentence, - threshold=self.threshold, - common_terms=self.common_terms, - scorer=functools.partial( - self.scoring, - len_vocab=float(len(self.vocab)), - min_count=float(self.min_count), - corpus_word_count=float(self.corpus_word_count), - ), + logger.info("merged %s", self) + + def score_candidate(self, word_a, word_b, in_between): + # Micro optimization: check for quick early-out conditions, before the actual scoring. + word_a_cnt = self.vocab[word_a] + if word_a_cnt <= 0: + return None, None + + word_b_cnt = self.vocab[word_b] + if word_b_cnt <= 0: + return None, None + + phrase = self.delimiter.join([word_a] + in_between + [word_b]) + # XXX: Why do we care about *all* phrase tokens? Why not just score the start+end bigram? 
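+        # (One answer: `_learn_vocab` counts each candidate under its full joined key,
+        # including any `common_terms` in between, e.g. 'bank_of_america', so only the
+        # full joined key has a count to look up here.)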
+ phrase_cnt = self.vocab[phrase] + if phrase_cnt <= 0: + return None, None + + score = self.scoring( + worda_count=word_a_cnt, wordb_count=word_b_cnt, bigram_count=phrase_cnt, + len_vocab=len(self.vocab), min_count=self.min_count, corpus_word_count=self.corpus_word_count, ) - for sentence in sentences: - bigrams = analyze_sentence(sentence) - # keeps only not None scores - filtered = ((words, score) for words, score in bigrams if score is not None) - for words, score in filtered: - if as_tuples: - yield (tuple(words), score) - else: - yield (out_delimiter.join(words), score) + if score <= self.threshold: + return None, None - def __getitem__(self, sentence): - """Convert the input tokens `sentence` into tokens where detected bigrams are joined by a selected delimiter. + return phrase, score - If `sentence` is an entire corpus (iterable of sentences rather than a single - sentence), return an iterable that converts each of the corpus' sentences - into phrases on the fly, one after another. + def freeze(self): + """ + Return an object that contains the bare minimum of information while still allowing + phrase detection. See :class:`~gensim.models.phrases.FrozenPhrases`. - Parameters - ---------- - sentence : {list of str, iterable of list of str} - Sentence or text corpus. + Use this "frozen model" to dramatically reduce RAM footprint if you don't plan to + make any further changes to your `Phrases` model. Returns ------- - {list of str, :class:`gensim.interfaces.TransformedCorpus`} - `sentence` with detected phrase bigrams merged together, or a streamed corpus of such sentences - if the input was a corpus. - - Examples - ---------- - .. sourcecode:: pycon - - >>> from gensim.test.utils import datapath - >>> from gensim.models.word2vec import Text8Corpus - >>> from gensim.models.phrases import Phrases, Phraser - >>> - >>> # Create corpus - >>> sentences = Text8Corpus(datapath('testcorpus.txt')) - >>> - >>> # Train the detector with: - >>> phrases = Phrases(sentences, min_count=1, threshold=1) - >>> # Input is a list of unicode strings: - >>> sent = [u'trees', u'graph', u'minors'] - >>> # Both of these tokens appear in corpus at least twice, and phrase score is higher, than treshold = 1: - >>> print(phrases[sent]) - [u'trees_graph', u'minors'] - >>> - >>> sentences = Text8Corpus(datapath('testcorpus.txt')) - >>> phrases = Phrases(sentences, min_count=1, threshold=1) - >>> phraser = Phraser(phrases) # for speedup - >>> - >>> sent = [[u'trees', u'graph', u'minors'], [u'graph', u'minors']] - >>> for phrase in phraser[sent]: - ... pass + :class:`~gensim.models.phrases.FrozenPhrases` + Exported object that's smaller, faster, but doesn't support model updates. """ - return _sentence2token(self, sentence) - - -def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): - r"""Bigram scoring function, based on the original `Mikolov, et. al: "Distributed Representations - of Words and Phrases and their Compositionality" `_. + return FrozenPhrases(self) - Parameters - ---------- - worda_count : int - Number of occurrences for first word. - wordb_count : int - Number of occurrences for second word. - bigram_count : int - Number of co-occurrences for phrase "worda_wordb". - len_vocab : int - Size of vocabulary. - min_count: int - Minimum collocation count threshold. - corpus_word_count : int - Not used in this particular scoring technique. - - Returns - ------- - float - Score for given bi-gram, greater than or equal to 0. 
- - Notes - ----- - Formula: :math:`\frac{(bigram\_count - min\_count) * len\_vocab }{ (worda\_count * wordb\_count)}`. - """ - return (bigram_count - min_count) / worda_count / wordb_count * len_vocab - - -def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): - r"""Calculation NPMI score based on `"Normalized (Pointwise) Mutual Information in Colocation Extraction" - by Gerlof Bouma `_. - - Parameters - ---------- - worda_count : int - Number of occurrences for first word. - wordb_count : int - Number of occurrences for second word. - bigram_count : int - Number of co-occurrences for phrase "worda_wordb". - len_vocab : int - Not used. - min_count: int - Ignore all bigrams with total collected count lower than this value. - corpus_word_count : int - Total number of words in the corpus. - - Returns - ------- - float - Score for given bi-gram, in the range -1 to 1. - - Notes - ----- - Formula: :math:`\frac{ln(prop(word_a, word_b) / (prop(word_a)*prop(word_b)))}{ -ln(prop(word_a, word_b)}`, - where :math:`prob(word) = \frac{word\_count}{corpus\_word\_count}` - - """ - if bigram_count >= min_count: - pa = worda_count / corpus_word_count - pb = wordb_count / corpus_word_count - pab = bigram_count / corpus_word_count - return log(pab / (pa * pb)) / -log(pab) - else: - # Return -infinity to make sure that no phrases will be created - # from bigrams less frequent than min_count - return float('-inf') - - -def pseudocorpus(source_vocab, sep, common_terms=frozenset()): - """Feeds `source_vocab`'s compound keys back to it, to discover phrases. - - Parameters - ---------- - source_vocab : iterable of list of str - Tokens vocabulary. - sep : str - Separator element. - common_terms : set, optional - Immutable set of stopwords. - - Yields - ------ - list of str - Phrase. - - """ - for k in source_vocab: - if sep not in k: - continue - unigrams = k.split(sep) - for i in range(1, len(unigrams)): - if unigrams[i - 1] not in common_terms: - # do not join common terms - cterms = list(itertools.takewhile(lambda w: w in common_terms, unigrams[i:])) - tail = unigrams[i + len(cterms):] - components = [sep.join(unigrams[:i])] + cterms - if tail: - components.append(sep.join(tail)) - yield components - - -class Phraser(SentenceAnalyzer, PhrasesTransformation): +class FrozenPhrases(_PhrasesTransformation): """Minimal state & functionality exported from :class:`~gensim.models.phrases.Phrases`. The goal of this class is to cut down memory consumption of `Phrases`, by discarding model state - not strictly needed for the bigram detection task. + not strictly needed for the phrase detection task. Use this instead of `Phrases` if you do not need to update the bigram statistics with new documents any more. @@ -796,27 +728,28 @@ def __init__(self, phrases_model): Parameters ---------- phrases_model : :class:`~gensim.models.phrases.Phrases` - Trained phrases instance. + Trained phrases instance, to extract all phrases from. Notes ----- - After the one-time initialization, a :class:`~gensim.models.phrases.Phraser` will be much smaller and somewhat - faster than using the full :class:`~gensim.models.phrases.Phrases` model. + After the one-time initialization, a :class:`~gensim.models.phrases.FrozenPhrases` will be much + smaller and faster than using the full :class:`~gensim.models.phrases.Phrases` model. Examples - -------- + ---------- .. 
sourcecode:: pycon
 
             >>> from gensim.test.utils import datapath
             >>> from gensim.models.word2vec import Text8Corpus
-            >>> from gensim.models.phrases import Phrases, Phraser
+            >>> from gensim.models.phrases import Phrases
             >>>
+            >>> # Load corpus and train a model.
             >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
             >>> phrases = Phrases(sentences, min_count=1, threshold=1)
             >>>
-            >>> bigram = Phraser(phrases)
-            >>> sent = [u'trees', u'graph', u'minors']
-            >>> print(bigram[sent])
+            >>> # Export a FrozenPhrases object that is more efficient but doesn't allow further training.
+            >>> frozen_phrases = phrases.freeze()
+            >>> sent = [u'trees', u'graph', u'minors']
+            >>> print(frozen_phrases[sent])
             [u'trees_graph', u'minors']
 
         """
@@ -825,99 +758,43 @@ def __init__(self, phrases_model):
         self.delimiter = phrases_model.delimiter
         self.scoring = phrases_model.scoring
         self.common_terms = phrases_model.common_terms
-        corpus = self.pseudocorpus(phrases_model)
-        self.phrasegrams = {}
-        logger.info('source_vocab length %i', len(phrases_model.vocab))
-        count = 0
-        for bigram, score in phrases_model.export_phrases(corpus, self.delimiter, as_tuples=True):
-            if bigram in self.phrasegrams:
-                logger.info('Phraser repeat %s', bigram)
-            self.phrasegrams[bigram] = score
-            count += 1
-            if not count % 50000:
-                logger.info('Phraser added %i phrasegrams', count)
-        logger.info('Phraser built with %i phrasegrams', len(self.phrasegrams))
-
-    def pseudocorpus(self, phrases_model):
-        """Alias for :func:`gensim.models.phrases.pseudocorpus`.
-
-        Parameters
-        ----------
-        phrases_model : :class:`~gensim.models.phrases.Phrases`
-            Phrases instance.
-
-        Return
-        ------
-        generator
-            Generator with phrases.
-
-        """
-        return pseudocorpus(phrases_model.vocab, phrases_model.delimiter, phrases_model.common_terms)
+        logger.info('exporting phrases from %s', phrases_model)
+        self.phrasegrams = self._import_phrases(phrases_model)
+        logger.info('exported %s', self)
 
-    def score_item(self, worda, wordb, components, scorer):
-        """Score a bigram.
+    def __str__(self):
+        return "%s<%i phrases, min_count=%s, threshold=%s>" % (
+            self.__class__.__name__, len(self.phrasegrams), self.min_count, self.threshold,
+        )
 
-        Parameters
-        ----------
-        worda : str
-            First word for comparison.
-        wordb : str
-            Second word for comparison.
-        components : generator
-            Contain phrases.
-        scorer : {'default', 'npmi'}
-            NOT USED.
+    def _import_phrases(self, phrases_model):
+        """Extract all phrases that pass the threshold out of `phrases_model`.
 
         Returns
-        -------
-        float
-            Score for given bi-gram, if bi-gram not presented in dictionary
-            return -1.
+        -------
+        dict[str, float]
+            Mapping between phrases and their scores.
 
         """
-        try:
-            return self.phrasegrams[tuple(components)]
-        except KeyError:
-            return -1
-
-    def __getitem__(self, sentence):
-        """Convert the input sequence of tokens `sentence` into a sequence of tokens where adjacent
-        tokens are replaced by a single token if they form a bigram collocation.
+        result, source_vocab = {}, phrases_model.vocab
+        for token in source_vocab:
+            unigrams = token.split(self.delimiter)
+            if len(unigrams) < 2:
+                continue  # no phrases here
+            phrase, score = phrases_model.score_candidate(unigrams[0], unigrams[-1], unigrams[1:-1])
+            if score is not None:
+                result[phrase] = score
+        return result
 
-        Parameters
-        ----------
-        sentence : {list of str, iterable of list of str}
-            Input sentence or a stream of sentences.
-
-        Return
-        ------
-        {list of str, iterable of list of str}
-            Sentence or sentences with phrase tokens joined by `self.delimiter` character.
+ def score_candidate(self, word_a, word_b, in_between): + phrase = self.delimiter.join([word_a] + in_between + [word_b]) + score = self.phrasegrams.get(phrase, NEGATIVE_INFINITY) + if score > self.threshold: + return phrase, score + return None, None - Examples - ---------- - .. sourcecode:: pycon - >>> from gensim.test.utils import datapath - >>> from gensim.models.word2vec import Text8Corpus - >>> from gensim.models.phrases import Phrases, Phraser - >>> - >>> sentences = Text8Corpus(datapath('testcorpus.txt')) # Read corpus - >>> - >>> phrases = Phrases(sentences, min_count=1, threshold=1) # Train model - >>> # Create a Phraser object to transform any sentence and turn 2 suitable tokens into 1 phrase - >>> phraser_model = Phraser(phrases) - >>> - >>> sent = [u'trees', u'graph', u'minors'] - >>> print(phraser_model[sent]) - [u'trees_graph', u'minors'] - >>> sent = [[u'trees', u'graph', u'minors'], [u'graph', u'minors']] - >>> for phrase in phraser_model[sent]: - ... print(phrase) - [u'trees_graph', u'minors'] - [u'graph_minors'] - - """ - return _sentence2token(self, sentence) +Phraser = FrozenPhrases # alias for backward compatibility if __name__ == '__main__': @@ -935,7 +812,6 @@ def __getitem__(self, sentence): from gensim.models.word2vec import Text8Corpus sentences = Text8Corpus(infile) - # test_doc = LineSentence('test/test_data/testcorpus.txt') bigram = Phrases(sentences, min_count=5, threshold=100) for s in bigram[sentences]: - print(utils.to_utf8(u' '.join(s))) + print(u' '.join(s)) diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index 1570acf224..4c04292473 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -27,12 +27,12 @@ >>> assert ['I', 'love', 'computer_science'] == m.fit_transform(texts)[0] """ -from six import string_types + from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError from gensim import models -from gensim.models.phrases import Phraser +from gensim.models.phrases import FrozenPhrases class PhrasesTransformer(TransformerMixin, BaseEstimator): @@ -44,8 +44,10 @@ class PhrasesTransformer(TransformerMixin, BaseEstimator): `_. """ - def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, - delimiter=b'_', progress_per=10000, scoring='default', common_terms=frozenset()): + def __init__( + self, min_count=5, threshold=10.0, max_vocab_size=40000000, + delimiter='_', progress_per=10000, scoring='default', common_terms=frozenset(), + ): """ Parameters @@ -58,7 +60,7 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, Maximum size of the vocabulary. Used to control pruning of less common words, to keep memory under control. The default of 40M needs about 3.6GB of RAM. delimiter : str, optional - Character used to join collocation tokens, should be a byte string (e.g. b'_'). + Character used to join collocation tokens (e.g. '_'). progress_per : int, optional Training will report to the logger every that many phrases are learned. 
scoring : str or function, optional @@ -127,7 +129,7 @@ def fit(self, X, y=None): max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per, scoring=self.scoring, common_terms=self.common_terms ) - self.phraser = Phraser(self.gensim_model) + self.phraser = FrozenPhrases(self.gensim_model) return self def transform(self, docs): @@ -152,10 +154,10 @@ def transform(self, docs): ) if self.phraser is None: - self.phraser = Phraser(self.gensim_model) + self.phraser = FrozenPhrases(self.gensim_model) # input as python lists - if isinstance(docs[0], string_types): + if isinstance(docs[0], str): docs = [docs] return [self.phraser[doc] for doc in docs] @@ -186,5 +188,5 @@ def partial_fit(self, X): ) self.gensim_model.add_vocab(X) - self.phraser = Phraser(self.gensim_model) + self.phraser = FrozenPhrases(self.gensim_model) return self diff --git a/gensim/test/test_data/phrases-transformer-new-v3-5-0.pkl b/gensim/test/test_data/phrases-transformer-new-v3-5-0.pkl deleted file mode 100644 index 7799418058..0000000000 Binary files a/gensim/test/test_data/phrases-transformer-new-v3-5-0.pkl and /dev/null differ diff --git a/gensim/test/test_data/phrases-transformer-v3-5-0.pkl b/gensim/test/test_data/phrases-transformer-v3-5-0.pkl deleted file mode 100644 index 8ffef6763b..0000000000 Binary files a/gensim/test/test_data/phrases-transformer-v3-5-0.pkl and /dev/null differ diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index ed85fea2b5..9c7a73cae4 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -4,140 +4,97 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """ -Automated tests for checking transformation algorithms (the models package). +Automated tests for the phrase detection module. 
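+
+Run them directly with any standard unittest runner, for example
+``python -m unittest gensim.test.test_phrases``.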
""" - import logging import unittest -import six import numpy as np -from gensim.utils import to_unicode -from gensim.models.phrases import SentenceAnalyzer, Phrases, Phraser -from gensim.models.phrases import pseudocorpus, original_scorer +from gensim.models.phrases import Phrases, FrozenPhrases, _PhrasesTransformation +from gensim.models.phrases import original_scorer from gensim.test.utils import common_texts, temporary_file, datapath -class TestUtils(unittest.TestCase): - - def test_pseudocorpus_no_common_terms(self): - vocab = [ - "prime_minister", - "gold", - "chief_technical_officer", - "effective"] - result = list(pseudocorpus(vocab, "_")) - self.assertEqual( - result, - [["prime", "minister"], - ["chief", "technical_officer"], - ["chief_technical", "officer"]]) - - def test_pseudocorpus_with_common_terms(self): - vocab = [ - "hall_of_fame", - "gold", - "chief_of_political_bureau", - "effective", - "beware_of_the_dog_in_the_yard"] - common_terms = frozenset(["in", "the", "of"]) - result = list(pseudocorpus(vocab, "_", common_terms=common_terms)) - self.assertEqual( - result, - [["hall", "of", "fame"], - ["chief", "of", "political_bureau"], - ["chief_of_political", "bureau"], - ["beware", "of", "the", "dog_in_the_yard"], - ["beware_of_the_dog", "in", "the", "yard"]]) - - class TestPhraseAnalysis(unittest.TestCase): - class AnalysisTester(SentenceAnalyzer): + class AnalysisTester(_PhrasesTransformation): - def __init__(self, scores): + def __init__(self, scores, threshold): + super().__init__(common_terms={"a", "the", "with", "of"}) self.scores = scores + self.threshold = threshold - def score_item(self, worda, wordb, components, scorer): - """Override for test purpose""" - if worda is not None and wordb is not None: - bigram_word = b"_".join(components) - return self.scores.get(bigram_word, -1) - else: - return -1 - - def analyze(self, scores, sentence): - analyzer = self.AnalysisTester(scores) - return list(analyzer.analyze_sentence( - sentence, - threshold=1, - common_terms={b"a", b"the", b"with", b"of"}, - scorer=None)) - - def analyze_words(self, scores, sentence): - result = ( - w if isinstance(w, (tuple, list)) else [w] - for w, score in self.analyze(scores, sentence)) - return [b"_".join(w).decode("utf-8") for w in result] + def score_candidate(self, word_a, word_b, in_between): + phrase = "_".join([word_a] + in_between + [word_b]) + score = self.scores.get(phrase, -1) + if score > self.threshold: + return phrase, score + return None, None def test_simple_analysis(self): - s = ["simple", "sentence", "should", "pass"] - result = self.analyze_words({}, s) - self.assertEqual(result, s) - s = ["a", "simple", "sentence", "with", "no", "bigram", "but", "common", "terms"] - result = self.analyze_words({}, s) - self.assertEqual(result, s) + """Test transformation with no phrases.""" + sentence = ["simple", "sentence", "should", "pass"] + result = self.AnalysisTester({}, threshold=1)[sentence] + self.assertEqual(result, sentence) + sentence = ["a", "simple", "sentence", "with", "no", "bigram", "but", "common", "terms"] + result = self.AnalysisTester({}, threshold=1)[sentence] + self.assertEqual(result, sentence) def test_analysis_bigrams(self): scores = { - b"simple_sentence": 2, b"sentence_many": 2, - b"many_possible": 2, b"possible_bigrams": 2} - s = ["simple", "sentence", "many", "possible", "bigrams"] - result = self.analyze_words(scores, s) + "simple_sentence": 2, "sentence_many": 2, + "many_possible": 2, "possible_bigrams": 2, + } + sentence = ["simple", "sentence", "many", 
"possible", "bigrams"] + result = self.AnalysisTester(scores, threshold=1)[sentence] self.assertEqual(result, ["simple_sentence", "many_possible", "bigrams"]) - s = ["some", "simple", "sentence", "many", "bigrams"] - result = self.analyze_words(scores, s) + sentence = ["some", "simple", "sentence", "many", "bigrams"] + result = self.AnalysisTester(scores, threshold=1)[sentence] self.assertEqual(result, ["some", "simple_sentence", "many", "bigrams"]) - s = ["some", "unrelated", "simple", "words"] - result = self.analyze_words(scores, s) - self.assertEqual(result, s) + sentence = ["some", "unrelated", "simple", "words"] + result = self.AnalysisTester(scores, threshold=1)[sentence] + self.assertEqual(result, sentence) def test_analysis_common_terms(self): scores = { - b"simple_sentence": 2, b"sentence_many": 2, - b"many_possible": 2, b"possible_bigrams": 2} - s = ["a", "simple", "sentence", "many", "the", "possible", "bigrams"] - result = self.analyze_words(scores, s) + "simple_sentence": 2, "sentence_many": 2, + "many_possible": 2, "possible_bigrams": 2, + } + sentence = ["a", "simple", "sentence", "many", "the", "possible", "bigrams"] + result = self.AnalysisTester(scores, threshold=1)[sentence] self.assertEqual(result, ["a", "simple_sentence", "many", "the", "possible_bigrams"]) - s = ["simple", "the", "sentence", "and", "many", "possible", "bigrams", "with", "a"] - result = self.analyze_words(scores, s) - self.assertEqual(result, [ - "simple", "the", "sentence", "and", "many_possible", "bigrams", "with", "a"]) + sentence = ["simple", "the", "sentence", "and", "many", "possible", "bigrams", "with", "a"] + result = self.AnalysisTester(scores, threshold=1)[sentence] + self.assertEqual( + result, + ["simple", "the", "sentence", "and", "many_possible", "bigrams", "with", "a"], + ) def test_analysis_common_terms_in_between(self): scores = { - b"simple_sentence": 2, b"sentence_with_many": 2, - b"many_possible": 2, b"many_of_the_possible": 2, b"possible_bigrams": 2} - s = ["sentence", "with", "many", "possible", "bigrams"] - result = self.analyze_words(scores, s) + "simple_sentence": 2, "sentence_with_many": 2, + "many_possible": 2, "many_of_the_possible": 2, "possible_bigrams": 2, + } + sentence = ["sentence", "with", "many", "possible", "bigrams"] + result = self.AnalysisTester(scores, threshold=1)[sentence] self.assertEqual(result, ["sentence_with_many", "possible_bigrams"]) - s = ["a", "simple", "sentence", "with", "many", "of", "the", "possible", "bigrams", "with"] - result = self.analyze_words(scores, s) + sentence = ["a", "simple", "sentence", "with", "many", "of", "the", "possible", "bigrams", "with"] + result = self.AnalysisTester(scores, threshold=1)[sentence] self.assertEqual( result, ["a", "simple_sentence", "with", "many_of_the_possible", "bigrams", "with"]) class PhrasesData: + sentences = common_texts + [ - ['graph', 'minors', 'survey', 'human', 'interface'] + ['graph', 'minors', 'survey', 'human', 'interface'], ] - unicode_sentences = [[to_unicode(w) for w in sentence] for sentence in sentences] common_terms = frozenset() bigram1 = u'response_time' @@ -148,24 +105,18 @@ def gen_sentences(self): return ((w for w in sentence) for sentence in self.sentences) -class PhrasesCommon: - """ Tests that need to be run for both Phrases and Phraser classes.""" +class PhrasesCommon(PhrasesData): + """Tests for both Phrases and FrozenPhrases classes.""" def setUp(self): - self.bigram = Phrases( - self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) - self.bigram_default = 
 
 
 class PhrasesData:
+
     sentences = common_texts + [
-        ['graph', 'minors', 'survey', 'human', 'interface']
+        ['graph', 'minors', 'survey', 'human', 'interface'],
     ]
-    unicode_sentences = [[to_unicode(w) for w in sentence] for sentence in sentences]
     common_terms = frozenset()
 
     bigram1 = u'response_time'
@@ -148,24 +105,18 @@ def gen_sentences(self):
         return ((w for w in sentence) for sentence in self.sentences)
 
 
-class PhrasesCommon:
-    """ Tests that need to be run for both Phrases and Phraser classes."""
+class PhrasesCommon(PhrasesData):
+    """Tests for both Phrases and FrozenPhrases classes."""
 
     def setUp(self):
-        self.bigram = Phrases(
-            self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
-        self.bigram_default = Phrases(
-            self.sentences, common_terms=self.common_terms)
-        self.bigram_utf8 = Phrases(
-            self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
-        self.bigram_unicode = Phrases(
-            self.unicode_sentences, min_count=1, threshold=1, common_terms=self.common_terms)
+        self.bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
+        self.bigram_default = Phrases(self.sentences, common_terms=self.common_terms)
 
     def testEmptyPhrasifiedSentencesIterator(self):
         bigram_phrases = Phrases(self.sentences)
-        bigram_phraser = Phraser(bigram_phrases)
+        bigram_phraser = FrozenPhrases(bigram_phrases)
         trigram_phrases = Phrases(bigram_phraser[self.sentences])
-        trigram_phraser = Phraser(trigram_phrases)
+        trigram_phraser = FrozenPhrases(trigram_phrases)
         trigrams = trigram_phraser[bigram_phraser[self.sentences]]
         fst, snd = list(trigrams), list(trigrams)
         self.assertEqual(fst, snd)
@@ -187,22 +138,27 @@ def testEmptyInputsOnBigramConstruction(self):
     def testSentenceGeneration(self):
         """Test basic bigram using a dummy corpus."""
         # test that we generate the same amount of sentences as the input
-        self.assertEqual(len(self.sentences), len(list(self.bigram_default[self.sentences])))
+        self.assertEqual(
+            len(self.sentences),
+            len(list(self.bigram_default[self.sentences])),
+        )
 
     def testSentenceGenerationWithGenerator(self):
         """Test basic bigram production when corpus is a generator."""
-        self.assertEqual(len(list(self.gen_sentences())),
-                         len(list(self.bigram_default[self.gen_sentences()])))
+        self.assertEqual(
+            len(list(self.gen_sentences())),
+            len(list(self.bigram_default[self.gen_sentences()])),
+        )
 
     def testBigramConstruction(self):
-        """Test Phrases bigram construction building."""
+        """Test Phrases bigram construction."""
         # with this setting we should get response_time and graph_minors
         bigram1_seen = False
         bigram2_seen = False
 
-        for s in self.bigram[self.sentences]:
-            if not bigram1_seen and self.bigram1 in s:
+        for sentence in self.bigram[self.sentences]:
+            if not bigram1_seen and self.bigram1 in sentence:
                 bigram1_seen = True
-            if not bigram2_seen and self.bigram2 in s:
+            if not bigram2_seen and self.bigram2 in sentence:
                 bigram2_seen = True
             if bigram1_seen and bigram2_seen:
                 break
@@ -218,7 +174,7 @@ def testBigramConstruction(self):
         self.assertTrue(self.bigram3 in self.bigram[self.sentences[-1]])
 
     def testBigramConstructionFromGenerator(self):
-        """Test Phrases bigram construction building when corpus is a generator"""
+        """Test Phrases bigram construction when the corpus is a generator."""
         bigram1_seen = False
         bigram2_seen = False
 
@@ -232,7 +188,7 @@ def testBigramConstructionFromGenerator(self):
         self.assertTrue(bigram1_seen and bigram2_seen)
 
     def testBigramConstructionFromArray(self):
-        """Test Phrases bigram construction building when corpus is a numpy array"""
+        """Test Phrases bigram construction when the corpus is a numpy array."""
         bigram1_seen = False
         bigram2_seen = False
 
@@ -245,16 +201,6 @@ def testBigramConstructionFromArray(self):
                 break
         self.assertTrue(bigram1_seen and bigram2_seen)
 
-    def testEncoding(self):
-        """Test that both utf8 and unicode input work; output must be unicode."""
-        expected = [u'survey', u'user', u'computer', u'system', u'response_time']
-
-        self.assertEqual(self.bigram_utf8[self.sentences[1]], expected)
-        self.assertEqual(self.bigram_unicode[self.sentences[1]], expected)
-
-        transformed = ' '.join(self.bigram_utf8[self.sentences[1]])
-        self.assertTrue(isinstance(transformed, six.text_type))
-
 
 # scorer for testCustomScorer
 # function is outside of the scope of the test because for picklability of custom scorer
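
testEmptyPhrasifiedSentencesIterator above also shows the standard recipe for trigrams: freeze a bigram model, then run a second Phrases pass over the bigrammed stream. Condensed into a sketch (corpus and thresholds as in these tests):

    >>> bigram_phraser = FrozenPhrases(Phrases(sentences, min_count=1, threshold=1))
    >>> trigram_phraser = FrozenPhrases(Phrases(bigram_phraser[sentences], min_count=1, threshold=1))
    >>> trigrams = trigram_phraser[bigram_phraser[sentences]]  # lazily evaluated, safe to iterate twice
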
@@ -264,43 +210,32 @@ def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co
     return 1
 
 
-class TestPhrasesModel(PhrasesData, PhrasesCommon, unittest.TestCase):
+class TestPhrasesModel(PhrasesCommon, unittest.TestCase):
 
     def testExportPhrases(self):
-        """Test Phrases bigram export_phrases functionality."""
-        bigram = Phrases(self.sentences, min_count=1, threshold=1)
-
-        seen_bigrams = set()
-
-        for phrase, score in bigram.export_phrases(self.sentences):
-            seen_bigrams.add(phrase)
+        """Test exporting phrases (bigrams) via export_phrases."""
+        bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
+        seen_bigrams = set(bigram.export_phrases(self.sentences).keys())
 
         assert seen_bigrams == {
-            b'response time',
-            b'graph minors',
-            b'human interface',
+            'response time',
+            'graph minors',
+            'human interface',
        }
 
     def testMultipleBigramsSingleEntry(self):
-        """ a single entry should produce multiple bigrams. """
-        bigram = Phrases(self.sentences, min_count=1, threshold=1)
-        seen_bigrams = set()
-
+        """Test that a single entry produces multiple bigrams."""
+        bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
         test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
-        for phrase, score in bigram.export_phrases(test_sentences):
-            seen_bigrams.add(phrase)
+        seen_bigrams = set(bigram.export_phrases(test_sentences).keys())
 
-        assert seen_bigrams == {b'graph minors', b'human interface'}
+        assert seen_bigrams == {'graph minors', 'human interface'}
 
     def testScoringDefault(self):
-        """ test the default scoring, from the mikolov word2vec paper """
-        bigram = Phrases(self.sentences, min_count=1, threshold=1)
-
-        seen_scores = set()
-
+        """Test the default scoring, from the Mikolov word2vec paper."""
+        bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
         test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
-        for phrase, score in bigram.export_phrases(test_sentences):
-            seen_scores.add(round(score, 3))
+        seen_scores = set(round(score, 3) for score in bigram.export_phrases(test_sentences).values())
 
         assert seen_scores == {
             5.167,  # score for graph minors
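
The tests above rely on the new export_phrases contract: it now returns a dict mapping each detected phrase, joined with the model's delimiter, to its score, instead of the old stream of (bytes, score) tuples. A sketch with the same toy sentence (scores rounded as in the tests):

    >>> bigram = Phrases(sentences, min_count=1, threshold=1, delimiter=' ')
    >>> phrase_scores = bigram.export_phrases([['graph', 'minors', 'survey', 'human', 'interface']])
    >>> sorted(phrase_scores)
    ['graph minors', 'human interface']
    >>> round(phrase_scores['graph minors'], 3)
    5.167
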
@@ -308,22 +243,18 @@ def test__getitem__(self):
-        """ test Phrases[sentences] with a single sentence"""
+        """Test Phrases[sentences] with a single sentence."""
         bigram = Phrases(self.sentences, min_count=1, threshold=1)
-        # pdb.set_trace()
         test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
         phrased_sentence = next(bigram[test_sentences].__iter__())
 
         assert phrased_sentence == ['graph_minors', 'survey', 'human_interface']
 
     def testScoringNpmi(self):
-        """ test normalized pointwise mutual information scoring """
+        """Test normalized pointwise mutual information scoring."""
         bigram = Phrases(self.sentences, min_count=1, threshold=.5, scoring='npmi')
-
-        seen_scores = set()
         test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
-        for phrase, score in bigram.export_phrases(test_sentences):
-            seen_scores.add(round(score, 3))
+        seen_scores = set(round(score, 3) for score in bigram.export_phrases(test_sentences).values())
 
         assert seen_scores == {
             .882,  # score for graph minors
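
For reference, the 'npmi' scorer normalizes pointwise mutual information by -ln P(a, b), which squashes scores into [-1, 1] and makes a single threshold more portable across corpus sizes. A sketch of the formula in probability form (gensim's npmi_scorer computes the same quantity from raw counts):

    >>> from math import log
    >>> def npmi(prob_a, prob_b, prob_ab):
    ...     # ln(P(a,b) / (P(a) * P(b))) / -ln(P(a,b))
    ...     return log(prob_ab / (prob_a * prob_b)) / -log(prob_ab)
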
@@ -331,16 +262,12 @@ def testCustomScorer(self):
-        """ test using a custom scoring function """
-
+        """Test using a custom scoring function."""
         bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
-
-        seen_scores = []
         test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
-        for phrase, score in bigram.export_phrases(test_sentences):
-            seen_scores.append(score)
+        seen_scores = list(bigram.export_phrases(test_sentences).values())
 
-        assert all(seen_scores)  # all scores 1
+        assert all(score == 1 for score in seen_scores)
         assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
 
     def testBadParameters(self):
@@ -361,31 +288,25 @@ def testPruning(self):
 
 
 class TestPhrasesPersistence(PhrasesData, unittest.TestCase):
 
     def testSaveLoadCustomScorer(self):
-        """ saving and loading a Phrases object with a custom scorer """
-
+        """Test saving and loading a Phrases object with a custom scorer."""
         with temporary_file("test.pkl") as fpath:
             bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
             bigram.save(fpath)
             bigram_loaded = Phrases.load(fpath)
-            seen_scores = []
             test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
-            for phrase, score in bigram_loaded.export_phrases(test_sentences):
-                seen_scores.append(score)
+            seen_scores = list(bigram_loaded.export_phrases(test_sentences).values())
 
-            assert all(seen_scores)  # all scores 1
+            assert all(score == 1 for score in seen_scores)
             assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
 
     def testSaveLoad(self):
-        """ Saving and loading a Phrases object."""
-
+        """Test saving and loading a Phrases object."""
         with temporary_file("test.pkl") as fpath:
             bigram = Phrases(self.sentences, min_count=1, threshold=1)
             bigram.save(fpath)
             bigram_loaded = Phrases.load(fpath)
-            seen_scores = set()
             test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
-            for phrase, score in bigram_loaded.export_phrases(test_sentences):
-                seen_scores.add(round(score, 3))
+            seen_scores = set(round(score, 3) for score in bigram_loaded.export_phrases(test_sentences).values())
 
             assert seen_scores == set([
                 5.167,  # score for graph minors
@@ -393,13 +314,10 @@ def testSaveLoadStringScoring(self):
-        """ Saving and loading a Phrases object with a string scoring parameter.
-        This should ensure backwards compatibility with the previous version of Phrases"""
+        """Test backwards compatibility with a previous version of Phrases with string scoring."""
         bigram_loaded = Phrases.load(datapath("phrases-scoring-str.pkl"))
-        seen_scores = set()
         test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
-        for phrase, score in bigram_loaded.export_phrases(test_sentences):
-            seen_scores.add(round(score, 3))
+        seen_scores = set(round(score, 3) for score in bigram_loaded.export_phrases(test_sentences).values())
 
         assert seen_scores == set([
             5.167,  # score for graph minors
@@ -407,14 +325,10 @@ def testSaveLoadNoScoring(self):
-        """ Saving and loading a Phrases object with no scoring parameter.
-        This should ensure backwards compatibility with old versions of Phrases"""
-
+        """Test backwards compatibility with old versions of Phrases with no scoring parameter."""
         bigram_loaded = Phrases.load(datapath("phrases-no-scoring.pkl"))
-        seen_scores = set()
         test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
-        for phrase, score in bigram_loaded.export_phrases(test_sentences):
-            seen_scores.add(round(score, 3))
+        seen_scores = set(round(score, 3) for score in bigram_loaded.export_phrases(test_sentences).values())
 
         assert seen_scores == set([
             5.167,  # score for graph minors
@@ -426,77 +340,67 @@ def testSaveLoadNoCommonTerms(self):
         bigram_loaded = Phrases.load(datapath("phrases-no-common-terms.pkl"))
         self.assertEqual(bigram_loaded.common_terms, frozenset())
         # can make a phraser, cf #1751
-        phraser = Phraser(bigram_loaded)  # does not raise
+        phraser = FrozenPhrases(bigram_loaded)  # does not raise
         phraser[["human", "interface", "survey"]]  # does not raise
 
 
-class TestPhraserPersistence(PhrasesData, unittest.TestCase):
+class TestFrozenPhrasesPersistence(PhrasesData, unittest.TestCase):
 
     def testSaveLoadCustomScorer(self):
-        """Saving and loading a Phraser object with a custom scorer """
+        """Test saving and loading a FrozenPhrases object with a custom scorer."""
         with temporary_file("test.pkl") as fpath:
-            bigram = Phraser(
+            bigram = FrozenPhrases(
                 Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer))
             bigram.save(fpath)
-            bigram_loaded = Phraser.load(fpath)
-            # we do not much with scoring, just verify its the one expected
+            bigram_loaded = FrozenPhrases.load(fpath)
             self.assertEqual(bigram_loaded.scoring, dumb_scorer)
 
     def testSaveLoad(self):
-        """ Saving and loading a Phraser object."""
+        """Test saving and loading a FrozenPhrases object."""
         with temporary_file("test.pkl") as fpath:
-            bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
+            bigram = FrozenPhrases(Phrases(self.sentences, min_count=1, threshold=1))
             bigram.save(fpath)
-            bigram_loaded = Phraser.load(fpath)
+            bigram_loaded = FrozenPhrases.load(fpath)
             self.assertEqual(
                 bigram_loaded[['graph', 'minors', 'survey', 'human', 'interface', 'system']],
                 ['graph_minors', 'survey', 'human_interface', 'system'])
 
     def testSaveLoadStringScoring(self):
-        """ Saving and loading a Phraser object with a string scoring parameter.
-        This should ensure backwards compatibility with the previous version of Phraser"""
-        bigram_loaded = Phraser.load(datapath("phraser-scoring-str.pkl"))
+        """Test saving and loading a FrozenPhrases object with a string scoring parameter.
+        This should ensure backwards compatibility with the previous version of FrozenPhrases."""
+        bigram_loaded = FrozenPhrases.load(datapath("phraser-scoring-str.pkl"))
         # we do not much with scoring, just verify its the one expected
         self.assertEqual(bigram_loaded.scoring, original_scorer)
 
     def testSaveLoadNoScoring(self):
-        """ Saving and loading a Phraser object with no scoring parameter.
-        This should ensure backwards compatibility with old versions of Phraser"""
-        bigram_loaded = Phraser.load(datapath("phraser-no-scoring.pkl"))
+        """Test saving and loading a FrozenPhrases object with no scoring parameter.
+        This should ensure backwards compatibility with old versions of FrozenPhrases."""
+        bigram_loaded = FrozenPhrases.load(datapath("phraser-no-scoring.pkl"))
         # we do not much with scoring, just verify its the one expected
         self.assertEqual(bigram_loaded.scoring, original_scorer)
 
     def testSaveLoadNoCommonTerms(self):
-        """ Ensure backwards compatibility with old versions of Phraser, before common_terms"""
-        bigram_loaded = Phraser.load(datapath("phraser-no-common-terms.pkl"))
+        """Ensure backwards compatibility with old versions of FrozenPhrases, before common_terms."""
+        bigram_loaded = FrozenPhrases.load(datapath("phraser-no-common-terms.pkl"))
         self.assertEqual(bigram_loaded.common_terms, frozenset())
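
The freeze, save and load round trip exercised by this class, as one compact sketch (scratch file name and toy corpus as in PhrasesData; expected output from testSaveLoad above):

    >>> with temporary_file("test.pkl") as fpath:
    ...     FrozenPhrases(Phrases(sentences, min_count=1, threshold=1)).save(fpath)
    ...     bigram = FrozenPhrases.load(fpath)
    >>> bigram[['graph', 'minors', 'survey', 'human', 'interface', 'system']]
    ['graph_minors', 'survey', 'human_interface', 'system']
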
 
 
-class TestPhraserModel(PhrasesData, PhrasesCommon, unittest.TestCase):
-    """ Test Phraser models."""
+class TestFrozenPhrasesModel(PhrasesCommon, unittest.TestCase):
+    """Test FrozenPhrases models."""
 
     def setUp(self):
-        """Set up Phraser models for the tests."""
+        """Set up FrozenPhrases models for the tests."""
         bigram_phrases = Phrases(
             self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
-        self.bigram = Phraser(bigram_phrases)
+        self.bigram = FrozenPhrases(bigram_phrases)
 
         bigram_default_phrases = Phrases(self.sentences, common_terms=self.common_terms)
-        self.bigram_default = Phraser(bigram_default_phrases)
-
-        bigram_utf8_phrases = Phrases(
-            self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
-        self.bigram_utf8 = Phraser(bigram_utf8_phrases)
-
-        bigram_unicode_phrases = Phrases(
-            self.unicode_sentences, min_count=1, threshold=1, common_terms=self.common_terms)
-        self.bigram_unicode = Phraser(bigram_unicode_phrases)
+        self.bigram_default = FrozenPhrases(bigram_default_phrases)
 
 
 class CommonTermsPhrasesData:
-    """This mixin permits to reuse the test, using, this time the common_terms option
-    """
+    """Mixin that allows reusing the tests with the common_terms option."""
 
     sentences = [
         ['human', 'interface', 'with', 'computer'],
@@ -510,7 +414,6 @@ class CommonTermsPhrasesData:
         ['data', 'and', 'graph', 'survey'],
         ['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']  # test bigrams within same sentence
     ]
-    unicode_sentences = [[to_unicode(w) for w in sentence] for sentence in sentences]
     common_terms = ['of', 'and', 'for']
 
     bigram1 = u'lack_of_interest'
@@ -527,63 +430,43 @@ def gen_sentences(self):
 
 class TestPhrasesModelCommonTerms(CommonTermsPhrasesData, TestPhrasesModel):
     """Test Phrases models with common terms"""
 
-    def testEncoding(self):
-        """Test that both utf8 and unicode input work; output must be unicode."""
-        expected = [u'survey', u'of', u'user', u'computer', u'system', u'lack_of_interest']
-
-        self.assertEqual(self.bigram_utf8[self.sentences[1]], expected)
-        self.assertEqual(self.bigram_unicode[self.sentences[1]], expected)
-
-        transformed = ' '.join(self.bigram_utf8[self.sentences[1]])
-        self.assertTrue(isinstance(transformed, six.text_type))
-
""" - bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) - - seen_bigrams = set() + """Test a single entry produces multiple bigrams.""" + bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms, delimiter=' ') test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']] - for phrase, score in bigram.export_phrases(test_sentences): - seen_bigrams.add(phrase) + seen_bigrams = set(bigram.export_phrases(test_sentences).keys()) + assert seen_bigrams == set([ - b'data and graph', - b'human interface', + 'data and graph', + 'human interface', ]) def testExportPhrases(self): - """Test Phrases bigram export_phrases functionality.""" - bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) - - seen_bigrams = set() - - for phrase, score in bigram.export_phrases(self.sentences): - seen_bigrams.add(phrase) + """Test Phrases bigram export phrases.""" + bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms, delimiter=' ') + seen_bigrams = set(bigram.export_phrases(self.sentences).keys()) assert seen_bigrams == set([ - b'human interface', - b'graph of trees', - b'data and graph', - b'lack of interest', + 'human interface', + 'graph of trees', + 'data and graph', + 'lack of interest', ]) def testScoringDefault(self): """ test the default scoring, from the mikolov word2vec paper """ bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) - - seen_scores = set() - test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']] - for phrase, score in bigram.export_phrases(test_sentences): - seen_scores.add(round(score, 3)) + seen_scores = set(round(score, 3) for score in bigram.export_phrases(test_sentences).values()) min_count = float(bigram.min_count) len_vocab = float(len(bigram.vocab)) - graph = float(bigram.vocab[b"graph"]) - data = float(bigram.vocab[b"data"]) - data_and_graph = float(bigram.vocab[b"data_and_graph"]) - human = float(bigram.vocab[b"human"]) - interface = float(bigram.vocab[b"interface"]) - human_interface = float(bigram.vocab[b"human_interface"]) + graph = float(bigram.vocab["graph"]) + data = float(bigram.vocab["data"]) + data_and_graph = float(bigram.vocab["data_and_graph"]) + human = float(bigram.vocab["human"]) + interface = float(bigram.vocab["interface"]) + human_interface = float(bigram.vocab["human_interface"]) assert seen_scores == set([ # score for data and graph @@ -593,15 +476,13 @@ def testScoringDefault(self): ]) def testScoringNpmi(self): - """ test normalized pointwise mutual information scoring """ - bigram = Phrases(self.sentences, min_count=1, threshold=.5, - scoring='npmi', common_terms=self.common_terms) - - seen_scores = set() - + """Test normalized pointwise mutual information scoring.""" + bigram = Phrases( + self.sentences, min_count=1, threshold=.5, + scoring='npmi', common_terms=self.common_terms, + ) test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']] - for phrase, score in bigram.export_phrases(test_sentences): - seen_scores.add(round(score, 3)) + seen_scores = set(round(score, 3) for score in bigram.export_phrases(test_sentences).values()) assert seen_scores == set([ .74, # score for data and graph @@ -609,56 +490,35 @@ def testScoringNpmi(self): ]) def testCustomScorer(self): - """ test using a custom scoring function """ - - bigram = Phrases(self.sentences, min_count=1, threshold=.001, - 
 
     def test__getitem__(self):
-        """ test Phrases[sentences] with a single sentence"""
+        """Test Phrases[sentences] with a single sentence."""
         bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
-        # pdb.set_trace()
         test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
         phrased_sentence = next(bigram[test_sentences].__iter__())
 
         assert phrased_sentence == ['data_and_graph', 'survey', 'for', 'human_interface']
 
 
-class TestPhraserModelCommonTerms(CommonTermsPhrasesData, TestPhraserModel):
-
-    def testEncoding(self):
-        """Test that both utf8 and unicode input work; output must be unicode."""
-        expected = [u'survey', u'of', u'user', u'computer', u'system', u'lack_of_interest']
-
-        self.assertEqual(self.bigram_utf8[self.sentences[1]], expected)
-        self.assertEqual(self.bigram_unicode[self.sentences[1]], expected)
-
-        transformed = ' '.join(self.bigram_utf8[self.sentences[1]])
-        self.assertTrue(isinstance(transformed, six.text_type))
-
-
-class TestPhraserModelCompatibilty(unittest.TestCase):
+class TestFrozenPhrasesModelCompatibility(unittest.TestCase):
 
     def testCompatibilty(self):
-        phr = Phraser.load(datapath("phraser-3.6.0.model"))
-        model = Phrases.load(datapath("phrases-3.6.0.model"))
-
+        phrases = Phrases.load(datapath("phrases-3.6.0.model"))
+        phraser = FrozenPhrases.load(datapath("phraser-3.6.0.model"))
         test_sentences = ['trees', 'graph', 'minors']
-        expected_res = ['trees', 'graph_minors']
-
-        phr_out = phr[test_sentences]
-        model_out = model[test_sentences]
 
-        self.assertEqual(phr_out, expected_res)
-        self.assertEqual(model_out, expected_res)
+        self.assertEqual(phrases[test_sentences], ['trees', 'graph_minors'])
+        self.assertEqual(phraser[test_sentences], ['trees', 'graph_minors'])
 
 
 if __name__ == '__main__':
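
The common_terms behaviour exercised throughout this class, condensed into one sketch (corpus and connector words from CommonTermsPhrasesData; expected output from test__getitem__ above):

    >>> bigram = Phrases(sentences, min_count=1, threshold=1, common_terms=['of', 'and', 'for'])
    >>> bigram[['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
    ['data_and_graph', 'survey', 'for', 'human_interface']
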
diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py
index 9dc7d303eb..b6b9449eb4 100644
--- a/gensim/test/test_sklearn_api.py
+++ b/gensim/test/test_sklearn_api.py
@@ -1137,7 +1137,7 @@ def testPartialFit(self):
         new_sentences = [
             ['world', 'peace', 'humans', 'world', 'peace', 'world', 'peace', 'people'],
             ['world', 'peace', 'people'],
-            ['world', 'peace', 'humans']
+            ['world', 'peace', 'humans'],
         ]
         self.model.partial_fit(X=new_sentences)  # train model with new sentences
 
@@ -1182,30 +1182,6 @@ def setUp(self):
             [u'the', u'bank_of_america', u'offices', u'are', u'closed']
         ]
 
-    def testCompareToOld(self):
-        with open(datapath("phrases-transformer-v3-5-0.pkl"), "rb") as old_phrases_transformer_pkl:
-            old_phrases_transformer = pickle.load(old_phrases_transformer_pkl)
-        doc = phrases_sentences[-1]
-        phrase_tokens = old_phrases_transformer.transform(doc)[0]
-        expected_phrase_tokens = [u'graph_minors', u'survey', u'human_interface']
-        self.assertEqual(phrase_tokens, expected_phrase_tokens)
-
-        self.model.fit(phrases_sentences)
-        new_phrase_tokens = self.model.transform(doc)[0]
-        self.assertEqual(new_phrase_tokens, phrase_tokens)
-
-    def testLoadNew(self):
-        with open(datapath("phrases-transformer-new-v3-5-0.pkl"), "rb") as new_phrases_transformer_pkl:
-            old_phrases_transformer = pickle.load(new_phrases_transformer_pkl)
-        doc = phrases_sentences[-1]
-        phrase_tokens = old_phrases_transformer.transform(doc)[0]
-        expected_phrase_tokens = [u'graph_minors', u'survey', u'human_interface']
-        self.assertEqual(phrase_tokens, expected_phrase_tokens)
-
-        self.model.fit(phrases_sentences)
-        new_phrase_tokens = self.model.transform(doc)[0]
-        self.assertEqual(new_phrase_tokens, phrase_tokens)
-
     def testFitAndTransform(self):
         self.model.fit(phrases_w_common_terms)
 
@@ -1247,10 +1223,7 @@ def testPartialFit(self):
         self.assertEqual(transformed_2, expected_transformations_2)
 
 
-# specifically test pluggable scoring in Phrases, because possible pickling issues with function parameter
-
-# this is intentionally in main rather than a class method to support pickling
-# all scores will be 1
+# For testing pluggable scoring in Phrases: must live at module level so it stays pickleable.
 def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
     return 1
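
Because the scoring function is pickled together with the model, a custom scorer must live at module level, exactly like dumb_scorer above. Plugging one into the sklearn wrapper is then a one-liner; frequency_scorer below is hypothetical, but the six-argument signature is the required contract:

    >>> def frequency_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
    ...     return bigram_count  # rank candidate phrases purely by raw co-occurrence count
    >>>
    >>> model = PhrasesTransformer(min_count=1, threshold=1, scoring=frequency_scorer)
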