From e2fc412a98706c33f6be989c1899b4abea327746 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Tue, 18 Jul 2017 16:15:20 +0100 Subject: [PATCH] adding common terms to phrases model --- docs/notebooks/wikinews-bigram-en.ipynb | 397 ++++++++++++++++++++++++ gensim/models/phrases.py | 314 +++++++++++-------- gensim/test/test_phrases.py | 369 ++++++++++++++++++---- 3 files changed, 900 insertions(+), 180 deletions(-) create mode 100644 docs/notebooks/wikinews-bigram-en.ipynb diff --git a/docs/notebooks/wikinews-bigram-en.ipynb b/docs/notebooks/wikinews-bigram-en.ipynb new file mode 100644 index 0000000000..89ef9c3ec0 --- /dev/null +++ b/docs/notebooks/wikinews-bigram-en.ipynb @@ -0,0 +1,397 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Illustrating common terms usage using Wikinews in english" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## getting data\n", + "\n", + "We get the cirrussearch dump of wikinews (a dump meant for elastic-search indexation)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "LANG=\"english\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "fdate=20170327\n", + "fname=enwikinews-$fdate-cirrussearch-content.json.gz\n", + "if [ ! -e $fname ]\n", + "then\n", + " wget \"https://dumps.wikimedia.org/other/cirrussearch/$fdate/$fname\"\n", + "fi\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# iterator\n", + "import gzip\n", + "import json\n", + "\n", + "FDATE = 20170327\n", + "FNAME = \"enwikinews-%s-cirrussearch-content.json.gz\" % FDATE\n", + "\n", + "def iter_texts(fpath=FNAME):\n", + " with gzip.open(fpath, \"rt\") as f:\n", + " for l in f:\n", + " data = json.loads(l)\n", + " if \"title\" in data:\n", + " yield data[\"title\"]\n", + " yield data[\"text\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# also prepare nltk\n", + "import nltk\n", + "nltk.download(\"punkt\")\n", + "nltk.download(\"stopwords\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparing data\n", + "\n", + "we arrange the corpus as required by gensim" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# make a custom tokenizer\n", + "import re\n", + "from nltk.tokenize import sent_tokenize\n", + "from nltk.tokenize import RegexpTokenizer\n", + "tokenizer = RegexpTokenizer('\\w[\\w-]*|\\d[\\d,]*')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# prepare a text\n", + "def prepare(txt):\n", + " # lower case\n", + " txt = txt.lower()\n", + " return [tokenizer.tokenize(sent) \n", + " for sent in sent_tokenize(txt, 
language=LANG)]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# we put all data in ram, it's not so much\n", + "corpus = []\n", + "for txt in iter_texts():\n", + " corpus.extend(prepare(txt))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Corpus has 1003521 words in 46159 sentences\n" + ] + } + ], + "source": [ + "# how many sentences and words ?\n", + "words_count = sum(len(s) for s in corpus)\n", + "print(\"Corpus has %d words in %d sentences\" % (words_count, len(corpus)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Testing bigram with and without common terms\n", + "\n", + "The `Phrases` model gives us the possiblity of handling common terms, that is words that appears much time in a text and are there only to link objects between them.\n", + "While you could remove them, you may information, for *\"the president is in america\"* is not the same as *\"the president of america\"*\n", + "\n", + "The common_terms parameter Phrases can help you deal with them in a smarter way, keeping them around but avoiding them to crush frequency statistics." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + } + ], + "source": [ + "from gensim.models.phrases import Phrases" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now d ll m o re ve y ain aren couldn didn doesn hadn hasn haven isn ma mightn mustn needn shan shouldn wasn weren won wouldn'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# which are the stop words we will use\n", + "from nltk.corpus import stopwords\n", + "\" \".join(stopwords.words(LANG))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# a version of corups without stop words\n", + "stop_words = frozenset(stopwords.words(LANG))\n", + "def stopwords_filter(txt):\n", + " return [w for w in txt if w not in stop_words]\n", + "st_corpus = [stopwords_filter(txt) for txt in corpus]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.33 s, sys: 16 ms, total: 1.34 s\n", + "Wall time: 1.34 s\n", + "CPU times: user 1.64 s, sys: 24 ms, total: 1.67 s\n", + "Wall time: 1.67 s\n" + ] + } + ], + "source": [ + "# bigram std\n", + "%time bigram = Phrases(st_corpus)\n", + "# bigram with common terms\n", + "%time bigram_ct = Phrases(corpus, 
common_terms=stopwords.words(LANG))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### bigram with common terms inside\n", + "\n", + "What are (some of) the bigram founds thanks to common terms" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "510 grams with common terms found\n" + ] + }, + { + "data": { + "text/plain": [ + "[(5339.47619047619, 'borussia m gladbach'),\n", + " (5460.194782608696, 'billboard in jakarta'),\n", + " (5606.450000000001, 'christ of latter-day'),\n", + " (5862.954248366013, 'skull and bones'),\n", + " (6006.910714285714, 'preserved in amber'),\n", + " (6129.452168746287, 'aisyah and doan'),\n", + " (6158.114416475973, 'funded by your generous'),\n", + " (6407.371428571429, 'restored as burkina'),\n", + " (7081.831578947369, 'click on the donate'),\n", + " (7234.129032258064, 'qatar of intervening'),\n", + " (7377.621673923561, 'sinks in suva'),\n", + " (8146.123931623933, 'lahm to hang'),\n", + " (8163.0819009100105, 'istanbul s ataturk'),\n", + " (8305.851851851852, 'derails in tabasco'),\n", + " (9060.929292929293, 'poet of apostasy'),\n", + " (9593.925133689841, 'creator of kinder'),\n", + " (10512.09375, 'consulate in irbil'),\n", + " (12176.904977375565, 'newsworthy and entertaining'),\n", + " (15829.976470588235, 'santos over nepotism'),\n", + " (16272.689342403628, 'hotness of bhut')]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# grams that have more than 2 terms, are those with common terms\n", + "ct_ngrams = set((g[1], g[0].decode(\"utf-8\"))\n", + " for g in bigram_ct.export_phrases(corpus) \n", + " if len(g[0].split()) > 2)\n", + "ct_ngrams = sorted(list(ct_ngrams))\n", + "print(len(ct_ngrams), \"grams with common terms found\")\n", + "# highest scores\n", + "ct_ngrams[-20:]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "location-united : ['location of the united', 'location of united']\n", + "magnitude-6 : ['magnitude 6', 'magnitude of 6']\n", + "tuition-fees : ['tuition and fees', 'tuition fees']\n", + "pleaded-guilty : ['pleaded not guilty', 'pleaded guilty']\n", + "found-guilty : ['found not guilty', 'found guilty']\n", + "france-germany : ['france germany', 'france and germany']\n", + "earlier-week : ['earlier this week', 'earlier in the week']\n", + "since-2003 : ['since 2003', 'since the 2003']\n", + "contact-admissions : ['contact the admissions', 'contact admissions']\n", + "created-text : ['created from text', 'created from the text']\n", + "external-inter-wiki : ['external and inter-wiki', 'external inter-wiki']\n" + ] + } + ], + "source": [ + "# did we found any bigram with same words but different stopwords\n", + "import collections\n", + "by_terms = collections.defaultdict(set)\n", + "for ngram, score in bigram_ct.export_phrases(corpus):\n", + " grams = ngram.split()\n", + " by_terms[(grams[0], grams[-1])].add(ngram)\n", + "for k, v in by_terms.items():\n", + " if len(v) > 1:\n", + " print(b\"-\".join(k).decode(\"utf-8\"),\" : \", [w.decode(\"utf-8\") for w in v])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + 
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 33390fc08e..9ca833e27b 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -52,6 +52,22 @@ >>> print(trigram[bigram[sent]]) [u'the', u'new_york_times', u'is', u'a', u'newspaper'] +The common_terms parameter add a way to give special treatment to common terms (aka stop words) +such that their presence between two words +won't prevent bigram detection. +It allows to detect expressions like "bank of america" or "eye of the beholder". + +>>> common_terms = ["of", "with", "without", "and", "or", "the", "a"] +>>> ct_phrases = Phrases(sentence_stream, common_terms=common_terms) + +The phraser will of course inherit the common_terms from Phrases. + +>>> ct_bigram = Phraser(ct_phrases) +>>> sent = [u'the', u'mayor', u'shows', u'his', u'lack', u'of', u'interest'] +>>> print(bigram[sent]) +[u'the', u'mayor', u'shows', u'his', u'lack_of_interest'] + + .. [1] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013. @@ -98,7 +114,53 @@ def _is_single(obj): return False, obj_iter -class Phrases(interfaces.TransformationABC): +class SentenceAnalyzer: + + def analyze_sentence(self, sentence, threshold, common_terms, scoring): + """Analyze a sentence + + `sentence` a token list representing the sentence to be analyzed. + + `threshold` the minimum score for a bigram to be taken into account + + `common_terms` the list of common terms, they have a special treatment + + `scoring` a scoring function + taking as parameters a first word, a second, the components of an eventual bigram + and returning the score. + """ + s = [utils.any2utf8(w) for w in sentence] + last_uncommon = None + in_between = [] + # adding None is a trick that helps getting an automatic happy ending + # has it won't be a common_word, nor score + for word in s + [None]: + is_common = word in common_terms + if not is_common and last_uncommon: + chain = [last_uncommon] + in_between + [word] + # test between last_uncommon + score = scoring(last_uncommon, word, chain) + if score > threshold: + yield (chain, score) + last_uncommon = None + in_between = [] + else: + # release words individually + for w in it.chain([last_uncommon], in_between): + yield (w, None) + in_between = [] + last_uncommon = word + elif not is_common: + last_uncommon = word + else: # common term + if last_uncommon: + # wait for uncommon resolution + in_between.append(word) + else: + yield (word, None) + + +class Phrases(SentenceAnalyzer, interfaces.TransformationABC): """ Detect phrases, based on collected collocation counts. Adjacent words that appear together more frequently than expected are joined together with the `_` character. @@ -109,7 +171,7 @@ class Phrases(interfaces.TransformationABC): """ def __init__(self, sentences=None, min_count=5, threshold=10.0, max_vocab_size=40000000, delimiter=b'_', progress_per=10000, - scoring='default'): + scoring='default', common_terms=frozenset()): """ Initialize the model from an iterable of `sentences`. Each sentence must be a list of words (unicode strings) that will be used for training. 
@@ -149,6 +211,8 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, 'npmi' is more robust when dealing with common words that form part of common bigrams, and ranges from -1 to 1, but is slower to calculate than the default + `common_terms` is an optionnal list of "stop words" that won't affect frequency count + of expressions containing them. """ if min_count <= 0: raise ValueError("min_count should be at least 1") @@ -170,6 +234,7 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, self.progress_per = progress_per self.scoring = scoring self.corpus_word_count = 0 + self.common_terms = frozenset(utils.any2utf8(w) for w in common_terms) if sentences is not None: self.add_vocab(sentences) @@ -181,7 +246,8 @@ def __str__(self): self.threshold, self.max_vocab_size) @staticmethod - def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000): + def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000, + common_terms=frozenset()): """Collect unigram/bigram counts from the `sentences` iterable.""" sentence_no = -1 total_words = 0 @@ -192,15 +258,19 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000): if sentence_no % progress_per == 0: logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" % (sentence_no, total_words, len(vocab))) - sentence = [utils.any2utf8(w) for w in sentence] - for bigram in zip(sentence, sentence[1:]): - vocab[bigram[0]] += 1 - vocab[delimiter.join(bigram)] += 1 - total_words += 1 - - if sentence: # add last word skipped by previous loop - word = sentence[-1] - vocab[word] += 1 + s = [utils.any2utf8(w) for w in sentence] + last_uncommon = None + in_between = [] + for word in s: + if word not in common_terms: + vocab[word] += 1 + if last_uncommon is not None: + components = it.chain([last_uncommon], in_between, [word]) + vocab[delimiter.join(components)] += 1 + last_uncommon = word + in_between = [] + elif last_uncommon is not None: + in_between.append(word) total_words += 1 if len(vocab) > max_vocab_size: @@ -221,8 +291,8 @@ def add_vocab(self, sentences): # directly, but gives the new sentences a fighting chance to collect # sufficient counts, before being pruned out by the (large) accummulated # counts collected in previous learn_vocab runs. 
- min_reduce, vocab, total_words = \ - self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per) + min_reduce, vocab, total_words = self.learn_vocab( + sentences, self.max_vocab_size, self.delimiter, self.progress_per, self.common_terms) self.corpus_word_count += total_words if len(self.vocab) > 0: @@ -239,6 +309,56 @@ def add_vocab(self, sentences): logger.info("using %i counts as vocab in %s", len(vocab), self) self.vocab = vocab + @staticmethod + def original_scorer(word_a, word_b, components, + vocab, delimiter, len_vocab=0.0, min_count=0.0): + """Compute score for a bigram, following original mikolov word2vec paper + + all parameters but the first three should be fixed (thanks to `functools.partial`) + before using it as a score function to `analyze_sentence` + """ + if word_a in vocab and word_b in vocab: + bigram = delimiter.join(components) + pa = float(vocab[word_a]) + pb = float(vocab[word_b]) + pab = float(vocab[bigram]) + return (pab - min_count) / pa / pb * len_vocab + else: + return -1 + + @staticmethod + def npmi_scorer(word_a, word_b, components, + vocab, delimiter, corpus_word_count=0.0): + """normalized PMI + + all parameters but the first three should be fixed (thanks to `functools.partial`) + before using it as a score function to `analyze_sentence` + """ + if word_a in vocab and word_b in vocab: + bigram = delimiter.join(components) + pa = float(vocab[word_a]) / corpus_word_count + pb = float(vocab[word_b]) / corpus_word_count + pab = float(vocab[bigram]) / corpus_word_count + return log(pab / (pa * pb)) / -log(pab) + else: + return -1 + + def get_scoring_function(self): + if self.scoring == 'default': + scoring_function = partial( + self.original_scorer, + vocab=self.vocab, + delimiter=self.delimiter, + len_vocab=float(len(self.vocab)), + min_count=float(self.min_count)) + elif self.scoring == 'npmi': + scoring_function = partial( + self.npmi_scorer, + vocab=self.vocab, + delimiter=self.delimiter, + corpus_word_count=self.corpus_word_count) + return scoring_function + def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): """ Generate an iterator that contains all phrases in given 'sentences' @@ -252,47 +372,20 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): then you can debug the threshold with generated tsv """ - - vocab = self.vocab - threshold = self.threshold - delimiter = self.delimiter # delimiter used for lookup - min_count = self.min_count - scoring = self.scoring - corpus_word_count = self.corpus_word_count - - if scoring == 'default': - scoring_function = \ - partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count)) - elif scoring == 'npmi': - scoring_function = \ - partial(self.npmi_scorer, corpus_word_count=corpus_word_count) - # no else here to catch unknown scoring function, check is done in Phrases.__init__ - + analyze_sentence = partial( + self.analyze_sentence, + threshold=self.threshold, + common_terms=self.common_terms, + scoring=self.get_scoring_function()) for sentence in sentences: - s = [utils.any2utf8(w) for w in sentence] - last_bigram = False - - for word_a, word_b in zip(s, s[1:]): - # last bigram check was moved here to save a few CPU cycles - if word_a in vocab and word_b in vocab and not last_bigram: - bigram_word = delimiter.join((word_a, word_b)) - if bigram_word in vocab: - count_a = float(vocab[word_a]) - count_b = float(vocab[word_b]) - count_ab = float(vocab[bigram_word]) - score = scoring_function(count_a, count_b, count_ab) - # 
logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s", - # bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score) - # added mincount check because if the scorer doesn't contain min_count - # it would not be enforced otherwise - if score > threshold and count_ab >= min_count: - if as_tuples: - yield ((word_a, word_b), score) - else: - yield (out_delimiter.join((word_a, word_b)), score) - last_bigram = True - continue - last_bigram = False + bigrams = analyze_sentence(sentence) + # keeps only not None scores + filtered = ((words, score) for words, score in bigrams if score is not None) + for words, score in filtered: + if as_tuples: + yield (tuple(words), score) + else: + yield (out_delimiter.join(words), score) def __getitem__(self, sentence): """ @@ -321,64 +414,38 @@ def __getitem__(self, sentence): # return an iterable stream. return self._apply(sentence) - s, new_s = [utils.any2utf8(w) for w in sentence], [] - last_bigram = False - vocab = self.vocab - threshold = self.threshold delimiter = self.delimiter - min_count = self.min_count - for word_a, word_b in zip(s, s[1:]): - if word_a in vocab and word_b in vocab: - bigram_word = delimiter.join((word_a, word_b)) - if bigram_word in vocab and not last_bigram: - pa = float(vocab[word_a]) - pb = float(vocab[word_b]) - pab = float(vocab[bigram_word]) - score = (pab - min_count) / pa / pb * len(vocab) - # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s", - # bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score) - if score > threshold: - new_s.append(bigram_word) - last_bigram = True - continue - - if not last_bigram: - new_s.append(word_a) - last_bigram = False - - if s: # add last word skipped by previous loop - last_token = s[-1] - if not last_bigram: - new_s.append(last_token) - + bigrams = self.analyze_sentence( + sentence, + threshold=self.threshold, + common_terms=self.common_terms, + scoring=self.get_scoring_function()) + new_s = [] + for words, score in bigrams: + if score is not None: + words = delimiter.join(words) + new_s.append(words) return [utils.to_unicode(w) for w in new_s] - # calculation of score based on original mikolov word2vec paper - # len_vocab and min_count set so functools.partial works - @staticmethod - def original_scorer(worda_count, wordb_count, bigram_count, len_vocab=0.0, min_count=0.0): - return (bigram_count - min_count) / worda_count / wordb_count * len_vocab - - # normalized PMI, requires corpus size - @staticmethod - def npmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count=0.0): - pa = worda_count / corpus_word_count - pb = wordb_count / corpus_word_count - pab = bigram_count / corpus_word_count - return log(pab / (pa * pb)) / -log(pab) - -def pseudocorpus(source_vocab, sep): +def pseudocorpus(source_vocab, sep, common_terms=frozenset()): """Feeds source_vocab's compound keys back to it, to discover phrases""" for k in source_vocab: if sep not in k: continue unigrams = k.split(sep) for i in range(1, len(unigrams)): - yield [sep.join(unigrams[:i]), sep.join(unigrams[i:])] + if unigrams[i-1] not in common_terms: + # do not join common terms + cterms = list(it.takewhile(lambda w: w in common_terms, unigrams[i:])) + tail = unigrams[i + len(cterms):] + components = [sep.join(unigrams[:i])] + cterms + if tail: + components.append(sep.join(tail)) + yield components -class Phraser(interfaces.TransformationABC): +class Phraser(SentenceAnalyzer, interfaces.TransformationABC): """ Minimal state & 
functionality to apply results of a Phrases model to tokens. @@ -395,8 +462,9 @@ def __init__(self, phrases_model): self.min_count = phrases_model.min_count self.delimiter = phrases_model.delimiter self.scoring = phrases_model.scoring + self.common_terms = phrases_model.common_terms + corpus = self.pseudocorpus(phrases_model) self.phrasegrams = {} - corpus = pseudocorpus(phrases_model.vocab, phrases_model.delimiter) logger.info('source_vocab length %i', len(phrases_model.vocab)) count = 0 for bigram, score in phrases_model.export_phrases(corpus, self.delimiter, as_tuples=True): @@ -408,6 +476,19 @@ def __init__(self, phrases_model): logger.info('Phraser added %i phrasegrams', count) logger.info('Phraser built with %i %i phrasegrams', count, len(self.phrasegrams)) + def pseudocorpus(self, phrases_model): + return pseudocorpus(phrases_model.vocab, phrases_model.delimiter, + phrases_model.common_terms) + + @staticmethod + def scorer(word_a, word_b, components, vocab): + """score is retained from original dataset + """ + try: + return vocab[tuple(components)][1] + except KeyError: + return -1 + def __getitem__(self, sentence): """ Convert the input tokens `sentence` (=list of unicode strings) into phrase @@ -425,27 +506,18 @@ def __getitem__(self, sentence): # return an iterable stream. return self._apply(sentence) - s, new_s = [utils.any2utf8(w) for w in sentence], [] - last_bigram = False - phrasegrams = self.phrasegrams delimiter = self.delimiter - for word_a, word_b in zip(s, s[1:]): - bigram_tuple = (word_a, word_b) - if phrasegrams.get(bigram_tuple, (-1, -1))[1] > self.threshold and not last_bigram: - bigram_word = delimiter.join((word_a, word_b)) - new_s.append(bigram_word) - last_bigram = True - continue - - if not last_bigram: - new_s.append(word_a) - last_bigram = False - - if s: # add last word skipped by previous loop - last_token = s[-1] - if not last_bigram: - new_s.append(last_token) - + scoring_function = partial(self.scorer, vocab=self.phrasegrams) + bigrams = self.analyze_sentence( + sentence, + threshold=self.threshold, + common_terms=self.common_terms, + scoring=scoring_function) + new_s = [] + for words, score in bigrams: + if score is not None: + words = delimiter.join(words) + new_s.append(words) return [utils.to_unicode(w) for w in new_s] diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 688f92dbd0..7fd1d9e04d 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -14,7 +14,7 @@ import sys from gensim import utils -from gensim.models.phrases import Phrases, Phraser +from gensim.models.phrases import SentenceAnalyzer, Phrases, Phraser, pseudocorpus if sys.version_info[0] >= 3: unicode = str @@ -23,32 +23,153 @@ datapath = lambda fname: os.path.join(module_path, 'test_data', fname) -sentences = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'], - ['graph', 'minors', 'survey','human','interface'] #test bigrams within same sentence -] -unicode_sentences = [[utils.to_unicode(w) for w in sentence] for sentence in sentences] - - -def gen_sentences(): - return ((w for w in sentence) for sentence in sentences) - - -class TestPhrasesCommon(unittest.TestCase): +class TestUtils(unittest.TestCase): + + def test_pseudocorpus_no_common_terms(self): + vocab = [ + 
"prime_minister", + "gold", + "chief_technical_officer", + "effective"] + result = list(pseudocorpus(vocab, "_")) + self.assertEqual( + result, + [["prime", "minister"], + ["chief", "technical_officer"], + ["chief_technical", "officer"]]) + + def test_pseudocorpus_with_common_terms(self): + vocab = [ + "hall_of_fame", + "gold", + "chief_of_political_bureau", + "effective", + "beware_of_the_dog_in_the_yard"] + common_terms=frozenset(["in", "the", "of"]) + result = list(pseudocorpus(vocab, "_", common_terms=common_terms)) + self.assertEqual( + result, + [["hall", "of", "fame"], + ["chief", "of", "political_bureau"], + ["chief_of_political", "bureau"], + ["beware", "of", "the", "dog_in_the_yard"], + ["beware_of_the_dog", "in", "the", "yard"]]) + + +class TestPhraseAnalysis(unittest.TestCase): + + class AnalysisTester(SentenceAnalyzer): + + def __init__(self, scores): + self.scores = scores + + def scorer(self, word_a, word_b, components): + if word_a is not None and word_b is not None: + bigram_word = b"_".join(components) + return self.scores.get(bigram_word, -1) + else: + return -1 + + def analyze(self, scores, sentence): + analyzer = self.AnalysisTester(scores) + return list(analyzer.analyze_sentence( + sentence, + threshold=1, + common_terms={b"a", b"the", b"with", b"of"}, + scoring=analyzer.scorer)) + + def analyze_words(self, scores, sentence): + result = ( + w if isinstance(w, (tuple, list)) else [w] + for w, score in self.analyze(scores, sentence)) + return [b"_".join(w).decode("utf-8") for w in result] + + def test_simple_analysis(self): + s = ["simple", "sentence", "should", "pass"] + result = self.analyze_words({}, s) + self.assertEqual(result, s) + s = ["a", "simple", "sentence", "with", "no", "bigram", "but", "common", "terms"] + result = self.analyze_words({}, s) + self.assertEqual(result, s) + + def test_analysis_bigrams(self): + scores = { + b"simple_sentence": 2, b"sentence_many": 2, + b"many_possible": 2, b"possible_bigrams": 2} + s = ["simple", "sentence", "many", "possible", "bigrams"] + result = self.analyze_words(scores, s) + self.assertEqual(result, ["simple_sentence", "many_possible", "bigrams"]) + + s = ["some", "simple", "sentence", "many", "bigrams"] + result = self.analyze_words(scores, s) + self.assertEqual(result, ["some", "simple_sentence", "many", "bigrams"]) + + s = ["some", "unrelated", "simple", "words"] + result = self.analyze_words(scores, s) + self.assertEqual(result, s) + + def test_analysis_common_terms(self): + scores = { + b"simple_sentence": 2, b"sentence_many": 2, + b"many_possible": 2, b"possible_bigrams": 2} + s = ["a", "simple", "sentence", "many", "the", "possible", "bigrams"] + result = self.analyze_words(scores, s) + self.assertEqual(result, ["a", "simple_sentence", "many", "the", "possible_bigrams"]) + + s = ["simple", "the", "sentence", "and", "many", "possible", "bigrams", "with", "a"] + result = self.analyze_words(scores, s) + self.assertEqual(result, [ + "simple", "the", "sentence", "and", "many_possible", "bigrams", "with", "a"]) + + def test_analysis_common_terms_in_between(self): + scores = { + b"simple_sentence": 2, b"sentence_with_many": 2, + b"many_possible":2, b"many_of_the_possible": 2, b"possible_bigrams": 2} + s = ["sentence", "with", "many", "possible", "bigrams"] + result = self.analyze_words(scores, s) + self.assertEqual(result, ["sentence_with_many", "possible_bigrams"]) + + s = ["a", "simple", "sentence", "with", "many", "of", "the", "possible", "bigrams", "with"] + result = self.analyze_words(scores, s) + self.assertEqual( 
+ result, ["a", "simple_sentence", "with", "many_of_the_possible", "bigrams", "with"]) + + +class PhrasesData: + sentences = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'], + ['graph', 'minors', 'survey','human','interface'] #test bigrams within same sentence + ] + unicode_sentences = [[utils.to_unicode(w) for w in sentence] for sentence in sentences] + common_terms = frozenset() + + bigram1 = u'response_time' + bigram2 = u'graph_minors' + bigram3 = u'human_interface' + + def gen_sentences(self): + return ((w for w in sentence) for sentence in self.sentences) + + +class PhrasesCommon: """ Tests that need to be run for both Prases and Phraser classes.""" def setUp(self): - self.bigram = Phrases(sentences, min_count=1, threshold=1) - self.bigram_default = Phrases(sentences) - self.bigram_utf8 = Phrases(sentences, min_count=1, threshold=1) - self.bigram_unicode = Phrases(unicode_sentences, min_count=1, threshold=1) + self.bigram = Phrases( + self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) + self.bigram_default = Phrases( + self.sentences, common_terms=self.common_terms) + self.bigram_utf8 = Phrases( + self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) + self.bigram_unicode = Phrases( + self.unicode_sentences, min_count=1, threshold=1, common_terms=self.common_terms) def testEmptyInputsOnBigramConstruction(self): """Test that empty inputs don't throw errors and return the expected result.""" @@ -66,23 +187,22 @@ def testEmptyInputsOnBigramConstruction(self): def testSentenceGeneration(self): """Test basic bigram using a dummy corpus.""" # test that we generate the same amount of sentences as the input - self.assertEqual(len(sentences), len(list(self.bigram_default[sentences]))) + self.assertEqual(len(self.sentences), len(list(self.bigram_default[self.sentences]))) def testSentenceGenerationWithGenerator(self): """Test basic bigram production when corpus is a generator.""" - self.assertEqual(len(list(gen_sentences())), - len(list(self.bigram_default[gen_sentences()]))) + self.assertEqual(len(list(self.gen_sentences())), + len(list(self.bigram_default[self.gen_sentences()]))) def testBigramConstruction(self): """Test Phrases bigram construction building.""" # with this setting we should get response_time and graph_minors bigram1_seen = False bigram2_seen = False - - for s in self.bigram[sentences]: - if not bigram1_seen and u'response_time' in s: + for s in self.bigram[self.sentences]: + if not bigram1_seen and self.bigram1 in s: bigram1_seen = True - if not bigram2_seen and u'graph_minors' in s: + if not bigram2_seen and self.bigram2 in s: bigram2_seen = True if bigram1_seen and bigram2_seen: break @@ -91,21 +211,21 @@ def testBigramConstruction(self): # check the same thing, this time using single doc transformation # last sentence should contain both graph_minors and human_interface - self.assertTrue(u'response_time' in self.bigram[sentences[1]]) - self.assertTrue(u'response_time' in self.bigram[sentences[4]]) - self.assertTrue(u'graph_minors' in self.bigram[sentences[-2]]) - self.assertTrue(u'graph_minors' in self.bigram[sentences[-1]]) - self.assertTrue(u'human_interface' in self.bigram[sentences[-1]]) + self.assertTrue(self.bigram1 in self.bigram[self.sentences[1]]) + 
self.assertTrue(self.bigram1 in self.bigram[self.sentences[4]]) + self.assertTrue(self.bigram2 in self.bigram[self.sentences[-2]]) + self.assertTrue(self.bigram2 in self.bigram[self.sentences[-1]]) + self.assertTrue(self.bigram3 in self.bigram[self.sentences[-1]]) def testBigramConstructionFromGenerator(self): """Test Phrases bigram construction building when corpus is a generator""" bigram1_seen = False bigram2_seen = False - for s in self.bigram[gen_sentences()]: - if not bigram1_seen and 'response_time' in s: + for s in self.bigram[self.gen_sentences()]: + if not bigram1_seen and self.bigram1 in s: bigram1_seen = True - if not bigram2_seen and 'graph_minors' in s: + if not bigram2_seen and self.bigram2 in s: bigram2_seen = True if bigram1_seen and bigram2_seen: break @@ -115,33 +235,33 @@ def testEncoding(self): """Test that both utf8 and unicode input work; output must be unicode.""" expected = [u'survey', u'user', u'computer', u'system', u'response_time'] - self.assertEqual(self.bigram_utf8[sentences[1]], expected) - self.assertEqual(self.bigram_unicode[sentences[1]], expected) + self.assertEqual(self.bigram_utf8[self.sentences[1]], expected) + self.assertEqual(self.bigram_unicode[self.sentences[1]], expected) - transformed = ' '.join(self.bigram_utf8[sentences[1]]) + transformed = ' '.join(self.bigram_utf8[self.sentences[1]]) self.assertTrue(isinstance(transformed, unicode)) -class TestPhrasesModel(unittest.TestCase): +class TestPhrasesModel(PhrasesData, PhrasesCommon, unittest.TestCase): + def testExportPhrases(self): """Test Phrases bigram export_phrases functionality.""" - bigram = Phrases(sentences, min_count=1, threshold=1) + bigram = Phrases(self.sentences, min_count=1, threshold=1) seen_bigrams = set() - for phrase, score in bigram.export_phrases(sentences): + for phrase, score in bigram.export_phrases(self.sentences): seen_bigrams.add(phrase) assert seen_bigrams == set([ b'response time', b'graph minors', - b'human interface' + b'human interface', ]) def testMultipleBigramsSingleEntry(self): """ a single entry should produce multiple bigrams. 
""" - bigram = Phrases(sentences, min_count=1, threshold=1) - + bigram = Phrases(self.sentences, min_count=1, threshold=1) seen_bigrams = set() test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']] @@ -155,7 +275,7 @@ def testMultipleBigramsSingleEntry(self): def testScoringDefault(self): """ test the default scoring, from the mikolov word2vec paper """ - bigram = Phrases(sentences, min_count=1, threshold=1) + bigram = Phrases(self.sentences, min_count=1, threshold=1) seen_scores = set() @@ -170,7 +290,7 @@ def testScoringDefault(self): def testScoringNpmi(self): """ test normalized pointwise mutual information scoring """ - bigram = Phrases(sentences, min_count=1, threshold=.5, scoring='npmi') + bigram = Phrases(self.sentences, min_count=1, threshold=.5, scoring='npmi') seen_scores = set() @@ -186,35 +306,166 @@ def testScoringNpmi(self): def testBadParameters(self): """Test the phrases module with bad parameters.""" # should fail with something less or equal than 0 - self.assertRaises(ValueError, Phrases, sentences, min_count=0) + self.assertRaises(ValueError, Phrases, self.sentences, min_count=0) # threshold should be positive - self.assertRaises(ValueError, Phrases, sentences, threshold=-1) + self.assertRaises(ValueError, Phrases, self.sentences, threshold=-1) def testPruning(self): """Test that max_vocab_size parameter is respected.""" - bigram = Phrases(sentences, max_vocab_size=5) + bigram = Phrases(self.sentences, max_vocab_size=5) self.assertTrue(len(bigram.vocab) <= 5) #endclass TestPhrasesModel -class TestPhraserModel(TestPhrasesCommon): +class TestPhraserModel(PhrasesData, PhrasesCommon, unittest.TestCase): """ Test Phraser models.""" def setUp(self): """Set up Phraser models for the tests.""" - bigram_phrases = Phrases(sentences, min_count=1, threshold=1) + bigram_phrases = Phrases( + self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) self.bigram = Phraser(bigram_phrases) - bigram_default_phrases = Phrases(sentences) + bigram_default_phrases = Phrases(self.sentences, common_terms=self.common_terms) self.bigram_default = Phraser(bigram_default_phrases) - bigram_utf8_phrases = Phrases(sentences, min_count=1, threshold=1) + bigram_utf8_phrases = Phrases( + self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) self.bigram_utf8 = Phraser(bigram_utf8_phrases) - bigram_unicode_phrases = Phrases(unicode_sentences, min_count=1, threshold=1) + bigram_unicode_phrases = Phrases( + self.unicode_sentences, min_count=1, threshold=1, common_terms=self.common_terms) self.bigram_unicode = Phraser(bigram_unicode_phrases) + +class CommonTermsPhrasesData: + """This mixin permits to reuse the test, using, this time the common_terms option + """ + + sentences = [ + ['human', 'interface', 'with', 'computer'], + ['survey', 'of', 'user', 'computer', 'system', 'lack', 'of', 'interest'], + ['eps', 'user', 'interface', 'system'], + ['system', 'and', 'human', 'system', 'eps'], + ['user', 'lack', 'of', 'interest'], + ['trees'], + ['graph', 'of', 'trees'], + ['data', 'and', 'graph', 'of', 'trees'], + ['data', 'and', 'graph', 'survey'], + ['data', 'and', 'graph', 'survey', 'for', 'human','interface'] #test bigrams within same sentence + ] + unicode_sentences = [[utils.to_unicode(w) for w in sentence] for sentence in sentences] + common_terms = ['of', 'and', 'for'] + + bigram1 = u'lack_of_interest' + bigram2 = u'data_and_graph' + bigram3 = u'human_interface' + expression1 = u'lack of interest' + expression2 = u'data and graph' + expression3 = u'human 
interface' + + def gen_sentences(self): + return ((w for w in sentence) for sentence in self.sentences) + + +class TestPhrasesModelCommonTerms(CommonTermsPhrasesData, TestPhrasesModel): + """Test Phrases models with common terms""" + + def testEncoding(self): + """Test that both utf8 and unicode input work; output must be unicode.""" + expected = [u'survey', u'of', u'user', u'computer', u'system', u'lack_of_interest'] + + self.assertEqual(self.bigram_utf8[self.sentences[1]], expected) + self.assertEqual(self.bigram_unicode[self.sentences[1]], expected) + + transformed = ' '.join(self.bigram_utf8[self.sentences[1]]) + self.assertTrue(isinstance(transformed, unicode)) + + def testMultipleBigramsSingleEntry(self): + """ a single entry should produce multiple bigrams. """ + bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) + + seen_bigrams = set() + test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human','interface']] + for phrase, score in bigram.export_phrases(test_sentences): + seen_bigrams.add(phrase) + assert seen_bigrams == set([ + b'data and graph', + b'human interface', + ]) + + def testExportPhrases(self): + """Test Phrases bigram export_phrases functionality.""" + bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) + + seen_bigrams = set() + + for phrase, score in bigram.export_phrases(self.sentences): + seen_bigrams.add(phrase) + + assert seen_bigrams == set([ + b'human interface', + b'graph of trees', + b'data and graph', + b'lack of interest', + ]) + + def testScoringDefault(self): + """ test the default scoring, from the mikolov word2vec paper """ + bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) + + seen_scores = set() + + test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human','interface']] + for phrase, score in bigram.export_phrases(test_sentences): + seen_scores.add(round(score, 3)) + + min_count = float(bigram.min_count) + len_vocab = float(len(bigram.vocab)) + graph = bigram.vocab["graph"] + data = bigram.vocab["data"] + data_and_graph = bigram.vocab["data_and_graph"] + human = bigram.vocab["human"] + interface = bigram.vocab["interface"] + human_interface = bigram.vocab["human_interface"] + + assert seen_scores == set([ + # score for data and graph + round((data_and_graph - min_count) / data / graph * len_vocab, 3), + # score for human interface + round((human_interface - min_count) / human / interface * len_vocab, 3), + ]) + + def testScoringNpmi(self): + """ test normalized pointwise mutual information scoring """ + bigram = Phrases(self.sentences, min_count=1, threshold=.5, + scoring='npmi', common_terms=self.common_terms) + + seen_scores = set() + + test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human','interface']] + for phrase, score in bigram.export_phrases(test_sentences): + seen_scores.add(round(score, 3)) + + assert seen_scores == set([ + .74, # score for data and graph + .894 # score for human interface + ]) + +class TestPhraserModelCommonTerms(CommonTermsPhrasesData, TestPhraserModel): + + def testEncoding(self): + """Test that both utf8 and unicode input work; output must be unicode.""" + expected = [u'survey', u'of', u'user', u'computer', u'system', u'lack_of_interest'] + + self.assertEqual(self.bigram_utf8[self.sentences[1]], expected) + self.assertEqual(self.bigram_unicode[self.sentences[1]], expected) + + transformed = ' '.join(self.bigram_utf8[self.sentences[1]]) + 
self.assertTrue(isinstance(transformed, unicode)) + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main()
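
For reference, a minimal usage sketch of the `common_terms` behaviour this patch introduces. It reuses the toy corpus and the expected results from the new `CommonTermsPhrasesData` tests above; it is an illustration of the patched API, not part of the patch itself.

# Sketch only: corpus and expected outputs mirror the CommonTermsPhrasesData
# fixtures added by this patch; nothing here is real data.
from gensim.models.phrases import Phrases, Phraser

sentences = [
    ['human', 'interface', 'with', 'computer'],
    ['survey', 'of', 'user', 'computer', 'system', 'lack', 'of', 'interest'],
    ['eps', 'user', 'interface', 'system'],
    ['system', 'and', 'human', 'system', 'eps'],
    ['user', 'lack', 'of', 'interest'],
    ['trees'],
    ['graph', 'of', 'trees'],
    ['data', 'and', 'graph', 'of', 'trees'],
    ['data', 'and', 'graph', 'survey'],
    ['data', 'and', 'graph', 'survey', 'for', 'human', 'interface'],
]

# Words listed as common_terms do not distort the frequency statistics,
# but they are kept inside detected phrases ("lack_of_interest", "data_and_graph").
common_terms = frozenset(['of', 'and', 'for'])

phrases = Phrases(sentences, min_count=1, threshold=1, common_terms=common_terms)
print(phrases[sentences[1]])
# per testEncoding: ['survey', 'of', 'user', 'computer', 'system', 'lack_of_interest']

# The lighter Phraser wrapper inherits common_terms from the Phrases model.
bigram = Phraser(phrases)
print(bigram[sentences[1]])
# same expected output as above

# export_phrases lists every detected expression with its score;
# per testExportPhrases it should include 'lack of interest', 'data and graph',
# 'graph of trees' and 'human interface'.
for phrase, score in phrases.export_phrases(sentences):
    print(phrase.decode('utf-8'), score)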