Add common terms to phrases model. Fix #1258 (#1568)
* adding common terms to phrases model

* fix pep 8 and fix python3 compat issue in test

* small fix in Phraser for python 2.7 compat

* fix on phrases score_item to keep previous behaviour

* avoid rounding problem on a test (occurred on windows x64 + python3.5 in appveyor)
alexgarel authored and menshikh-iv committed Nov 1, 2017
1 parent d7190f3 commit b4515e0
Showing 4 changed files with 919 additions and 177 deletions.
397 changes: 397 additions & 0 deletions docs/notebooks/wikinews-bigram-en.ipynb
@@ -0,0 +1,397 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Illustrating common terms usage using Wikinews in english"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## getting data\n",
"\n",
"We get the cirrussearch dump of wikinews (a dump meant for elastic-search indexation)."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"LANG=\"english\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%%bash\n",
"\n",
"fdate=20170327\n",
"fname=enwikinews-$fdate-cirrussearch-content.json.gz\n",
"if [ ! -e $fname ]\n",
"then\n",
" wget \"https://dumps.wikimedia.org/other/cirrussearch/$fdate/$fname\"\n",
"fi\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# iterator\n",
"import gzip\n",
"import json\n",
"\n",
"FDATE = 20170327\n",
"FNAME = \"enwikinews-%s-cirrussearch-content.json.gz\" % FDATE\n",
"\n",
"def iter_texts(fpath=FNAME):\n",
" with gzip.open(fpath, \"rt\") as f:\n",
" for l in f:\n",
" data = json.loads(l)\n",
" if \"title\" in data:\n",
" yield data[\"title\"]\n",
" yield data[\"text\"]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# also prepare nltk\n",
"import nltk\n",
"nltk.download(\"punkt\")\n",
"nltk.download(\"stopwords\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preparing data\n",
"\n",
"we arrange the corpus as required by gensim"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# make a custom tokenizer\n",
"import re\n",
"from nltk.tokenize import sent_tokenize\n",
"from nltk.tokenize import RegexpTokenizer\n",
"tokenizer = RegexpTokenizer('\\w[\\w-]*|\\d[\\d,]*')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# prepare a text\n",
"def prepare(txt):\n",
" # lower case\n",
" txt = txt.lower()\n",
" return [tokenizer.tokenize(sent) \n",
" for sent in sent_tokenize(txt, language=LANG)]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# we put all data in ram, it's not so much\n",
"corpus = []\n",
"for txt in iter_texts():\n",
" corpus.extend(prepare(txt))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus has 1003521 words in 46159 sentences\n"
]
}
],
"source": [
"# how many sentences and words ?\n",
"words_count = sum(len(s) for s in corpus)\n",
"print(\"Corpus has %d words in %d sentences\" % (words_count, len(corpus)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Testing bigram with and without common terms\n",
"\n",
"The `Phrases` model gives us the possiblity of handling common terms, that is words that appears much time in a text and are there only to link objects between them.\n",
"While you could remove them, you may information, for *\"the president is in america\"* is not the same as *\"the president of america\"*\n",
"\n",
"The common_terms parameter Phrases can help you deal with them in a smarter way, keeping them around but avoiding them to crush frequency statistics."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"from gensim.models.phrases import Phrases"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"'i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now d ll m o re ve y ain aren couldn didn doesn hadn hasn haven isn ma mightn mustn needn shan shouldn wasn weren won wouldn'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# which are the stop words we will use\n",
"from nltk.corpus import stopwords\n",
"\" \".join(stopwords.words(LANG))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# a version of corups without stop words\n",
"stop_words = frozenset(stopwords.words(LANG))\n",
"def stopwords_filter(txt):\n",
" return [w for w in txt if w not in stop_words]\n",
"st_corpus = [stopwords_filter(txt) for txt in corpus]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.33 s, sys: 16 ms, total: 1.34 s\n",
"Wall time: 1.34 s\n",
"CPU times: user 1.64 s, sys: 24 ms, total: 1.67 s\n",
"Wall time: 1.67 s\n"
]
}
],
"source": [
"# bigram std\n",
"%time bigram = Phrases(st_corpus)\n",
"# bigram with common terms\n",
"%time bigram_ct = Phrases(corpus, common_terms=stopwords.words(LANG))"
]
},
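{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick check (a minimal sketch: the sentence below is made up for illustration), we apply the trained model with common terms to a single tokenized sentence. Any detected phrases depend entirely on the corpus statistics."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# apply the trained common-terms model to a made-up tokenized sentence;\n",
"# detected phrases are joined with '_' and depend on the corpus statistics\n",
"sample = \"the president of the united states arrived in new york\".split()\n",
"print(bigram_ct[sample])"
]
},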
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### bigram with common terms inside\n",
"\n",
"What are (some of) the bigram founds thanks to common terms"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"510 grams with common terms found\n"
]
},
{
"data": {
"text/plain": [
"[(5339.47619047619, 'borussia m gladbach'),\n",
" (5460.194782608696, 'billboard in jakarta'),\n",
" (5606.450000000001, 'christ of latter-day'),\n",
" (5862.954248366013, 'skull and bones'),\n",
" (6006.910714285714, 'preserved in amber'),\n",
" (6129.452168746287, 'aisyah and doan'),\n",
" (6158.114416475973, 'funded by your generous'),\n",
" (6407.371428571429, 'restored as burkina'),\n",
" (7081.831578947369, 'click on the donate'),\n",
" (7234.129032258064, 'qatar of intervening'),\n",
" (7377.621673923561, 'sinks in suva'),\n",
" (8146.123931623933, 'lahm to hang'),\n",
" (8163.0819009100105, 'istanbul s ataturk'),\n",
" (8305.851851851852, 'derails in tabasco'),\n",
" (9060.929292929293, 'poet of apostasy'),\n",
" (9593.925133689841, 'creator of kinder'),\n",
" (10512.09375, 'consulate in irbil'),\n",
" (12176.904977375565, 'newsworthy and entertaining'),\n",
" (15829.976470588235, 'santos over nepotism'),\n",
" (16272.689342403628, 'hotness of bhut')]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# grams that have more than 2 terms, are those with common terms\n",
"ct_ngrams = set((g[1], g[0].decode(\"utf-8\"))\n",
" for g in bigram_ct.export_phrases(corpus) \n",
" if len(g[0].split()) > 2)\n",
"ct_ngrams = sorted(list(ct_ngrams))\n",
"print(len(ct_ngrams), \"grams with common terms found\")\n",
"# highest scores\n",
"ct_ngrams[-20:]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"location-united : ['location of the united', 'location of united']\n",
"magnitude-6 : ['magnitude 6', 'magnitude of 6']\n",
"tuition-fees : ['tuition and fees', 'tuition fees']\n",
"pleaded-guilty : ['pleaded not guilty', 'pleaded guilty']\n",
"found-guilty : ['found not guilty', 'found guilty']\n",
"france-germany : ['france germany', 'france and germany']\n",
"earlier-week : ['earlier this week', 'earlier in the week']\n",
"since-2003 : ['since 2003', 'since the 2003']\n",
"contact-admissions : ['contact the admissions', 'contact admissions']\n",
"created-text : ['created from text', 'created from the text']\n",
"external-inter-wiki : ['external and inter-wiki', 'external inter-wiki']\n"
]
}
],
"source": [
"# did we found any bigram with same words but different stopwords\n",
"import collections\n",
"by_terms = collections.defaultdict(set)\n",
"for ngram, score in bigram_ct.export_phrases(corpus):\n",
" grams = ngram.split()\n",
" by_terms[(grams[0], grams[-1])].add(ngram)\n",
"for k, v in by_terms.items():\n",
" if len(v) > 1:\n",
" print(b\"-\".join(k).decode(\"utf-8\"),\" : \", [w.decode(\"utf-8\") for w in v])\n"
]
},
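{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, a trained `Phrases` model can be exported to a lighter `Phraser` object for faster phrase detection (a minimal sketch: `phraser_ct` is a name chosen here, and the sample sentence is again made up for illustration)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from gensim.models.phrases import Phraser\n",
"\n",
"# freeze the trained common-terms model into a Phraser for faster application\n",
"phraser_ct = Phraser(bigram_ct)\n",
"phraser_ct[\"the president of the united states arrived in new york\".split()]"
]
},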
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}