
[WIP] Phrases: make any2utf8 optional #1413

Closed
wants to merge 41 commits into from
Changes from 36 commits
41 commits
b56b801
[TDD] test for Phrases save load
Jun 14, 2017
089bad3
any2utf8 before save only
Jun 14, 2017
c0a1a79
any2utf8 on entire sentence instead of each words separately
Jun 21, 2017
180d278
pep8 fixes
Jun 21, 2017
2cbb840
resolved python3 byte error
Jun 21, 2017
f87a6a1
resolving python3 error
Jun 21, 2017
4445ac2
resolving python3 error
Jun 21, 2017
89fe6ad
resolving python3 bytestring error
Jun 23, 2017
82dbbf9
Merge branch 'develop' of https://github.com/RaRe-Technologies/gensim…
Jun 23, 2017
03acd0b
add assert for length after any2utf8
Jun 23, 2017
d8dc744
pep8 fixes
Jun 23, 2017
ad69bd5
delimiter not punctuation now
Jun 26, 2017
c7a885d
pep8 fixes
prakhar2b Jun 26, 2017
34ded9b
phrases optimization benchmark
Jun 27, 2017
aebe1c4
[TDD] remove ValueError test for min_count
Jun 28, 2017
02aa404
[TDD] test for recode_to_utf8 False
Jun 28, 2017
13f26ae
any2utf8 optional in phrases
Jun 28, 2017
22c9fbe
refactor code for delimiter unicode conversion when no any2utf8
Jun 28, 2017
4c6d8bb
pep8 fixes
Jun 28, 2017
aa5e5c3
updated benchmark for recode_to_utf8=False
Jun 29, 2017
9264329
recode_to_utf8 for both bytestring and unicode input
Jun 29, 2017
b81817c
added recode_to_utf8 to docstring
Jun 29, 2017
db376eb
detect encoding of corpus using next and iter
Jun 29, 2017
f103f8f
[TDD] test that phrases works for both bytestring and unicode input f…
Jun 29, 2017
580504a
[TDD] test that phraser works for both bytestring and unicode input f…
Jun 29, 2017
6088bf7
add support for both bytestring and unicode input for recode_to_utf8=…
Jun 29, 2017
a3fd479
corrected docstring for recode_to_utf8
Jun 29, 2017
5b70ec9
pep8 fixes
Jun 29, 2017
a8a0004
detect encoding of input stream using next and iter
Jun 29, 2017
3bd4c03
check for empty sentences before checking for encoding
Jun 29, 2017
d1771df
removed check and test for bad parameter
Jun 30, 2017
05a24d1
docstring and comments modified
Jun 30, 2017
a28ef32
put is_input_bytes and encoding check in learn_vocab instead of init
Jun 30, 2017
86fde36
updated docstring for recode_to_utf8
Jun 30, 2017
16c4696
[TDD] failing test for empty list or generator as input
Jun 30, 2017
dfcde96
raises ValueError for empty list or generator as input
Jun 30, 2017
c0d17c4
empty sentence not a special case, no exception or warning now
Jul 6, 2017
3c2e1cd
specific exception and debug log added for empty list/generator input
Jul 6, 2017
7792e09
converted the streamed iterable to an in-memory list for benchmark
Jul 6, 2017
0e4d862
modified debug message for empty list input
Jul 11, 2017
302f7f3
no implicit conversion for infer input if recode_to_utf8=False
Jul 11, 2017
373 changes: 373 additions & 0 deletions docs/notebooks/phrases-optimization-benchmark.ipynb
@@ -0,0 +1,373 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Performance improvement in Phrases module\n",
"\n",
"#### Author - Prakhar Pratyush (@prakhar2b)\n",
"[Google summer of code '17 live blog](https://rare-technologies.com/google-summer-of-code-2017-live-blog-performance-improvement-in-gensim-and-fasttext/)\n",
"\n",
"| Optimization | Python 2.7 | Python 3.6 | PR |\n",
"| ------------- |:-------------:| :------------:|\n",
"| original | ~ 36-38 sec | ~32-34 sec |\n",
"|recode_to_utf8=False| ~19-21 sec | ~20-22 sec | [#1413](https://github.com/RaRe-Technologies/gensim/pull/1413)\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Python 3.6.1 :: Anaconda 4.4.0 (64-bit)\r\n"
]
}
],
"source": [
"! python --version"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-06-29 13:19:26,967 : INFO : 'pattern' package not found; tag filters are not available for English\n"
]
}
],
"source": [
"import logging\n",
"logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
"\n",
"import profile\n",
"%load_ext autoreload\n",
"\n",
"import gensim\n",
"from gensim.models.word2vec import Text8Corpus\n",
"%autoreload"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#! git clone https://github.com/prakhar2b/gensim.git"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/prakhar\r\n"
]
}
],
"source": [
"!pwd"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#! wget http://mattmahoney.net/dc/text8.zip "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#! unzip text8.zip"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/prakhar/text8\n"
]
}
],
"source": [
"import os\n",
"text8_file = os.path.abspath('text8')\n",
"print(text8_file)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/prakhar/gensim\n"
]
}
],
"source": [
"% cd gensim"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Already on 'develop'\r\n",
"Your branch is up-to-date with 'origin/develop'.\r\n"
]
}
],
"source": [
"!git checkout develop\n",
"%autoreload"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! python setup.py install\n",
"%autoreload "
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-06-29 13:13:36,521 : INFO : collecting all words and their counts\n",
"2017-06-29 13:13:36,524 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types\n",
"2017-06-29 13:14:09,283 : INFO : collected 4400410 word types from a corpus of 17003506 words (unigram + bigrams) and 1701 sentences\n",
"2017-06-29 13:14:09,284 : INFO : using 4400410 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>\n",
"2017-06-29 13:14:09,385 : INFO : collecting all words and their counts\n",
"2017-06-29 13:14:09,387 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types\n",
"2017-06-29 13:14:41,861 : INFO : collected 4400410 word types from a corpus of 17003506 words (unigram + bigrams) and 1701 sentences\n",
"2017-06-29 13:14:41,863 : INFO : using 4400410 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>\n",
"2017-06-29 13:14:41,974 : INFO : collecting all words and their counts\n",
"2017-06-29 13:14:41,976 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types\n",
"2017-06-29 13:15:15,134 : INFO : collected 4400410 word types from a corpus of 17003506 words (unigram + bigrams) and 1701 sentences\n",
"2017-06-29 13:15:15,135 : INFO : using 4400410 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>\n",
"2017-06-29 13:15:15,238 : INFO : collecting all words and their counts\n",
"2017-06-29 13:15:15,240 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types\n",
"2017-06-29 13:15:54,512 : INFO : collected 4400410 word types from a corpus of 17003506 words (unigram + bigrams) and 1701 sentences\n",
"2017-06-29 13:15:54,513 : INFO : using 4400410 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>\n",
"2017-06-29 13:15:54,612 : INFO : collecting all words and their counts\n",
"2017-06-29 13:15:54,615 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types\n",
"2017-06-29 13:16:30,985 : INFO : collected 4400410 word types from a corpus of 17003506 words (unigram + bigrams) and 1701 sentences\n",
"2017-06-29 13:16:30,986 : INFO : using 4400410 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 loop, best of 3: 33.2 s per loop\n"
]
}
],
"source": [
"# currently on develop --- original code\n",
"from gensim.models import Phrases\n",
"bigram = Phrases(Text8Corpus(text8_file))\n",
Review comment from @piskvorky (Owner), Jun 30, 2017:

Better convert the streamed iterable to an in-memory list (using list()), it's small enough. That way we don't have to iterate over the file from disk every time.

This will make the benchmark conclusions stronger (less noise and delays from other, unrelated parts of the code, IO overhead etc).


"%timeit bigram = Phrases(Text8Corpus(text8_file))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Switched to branch 'any2utf8'\r\n",
"Your branch is up-to-date with 'origin/any2utf8'.\r\n"
]
}
],
"source": [
"%autoreload \n",
"! git checkout any2utf8\n",
"%autoreload"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! python setup.py install\n",
"%autoreload "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-06-29 13:19:38,063 : INFO : collecting all words and their counts\n",
"2017-06-29 13:19:38,074 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types\n",
"2017-06-29 13:20:08,504 : INFO : collected 4400410 word types from a corpus of 17003506 words (unigram + bigrams) and 1701 sentences\n",
"2017-06-29 13:20:08,505 : INFO : using 4400410 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>\n",
"2017-06-29 13:20:08,508 : INFO : collecting all words and their counts\n",
"2017-06-29 13:20:08,512 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types\n",
"2017-06-29 13:20:40,463 : INFO : collected 4400410 word types from a corpus of 17003506 words (unigram + bigrams) and 1701 sentences\n",
"2017-06-29 13:20:40,464 : INFO : using 4400410 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>\n",
"2017-06-29 13:20:40,546 : INFO : collecting all words and their counts\n",
"2017-06-29 13:20:40,549 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types\n",
"2017-06-29 13:21:16,204 : INFO : collected 4400410 word types from a corpus of 17003506 words (unigram + bigrams) and 1701 sentences\n",
"2017-06-29 13:21:16,205 : INFO : using 4400410 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>\n",
"2017-06-29 13:21:16,305 : INFO : collecting all words and their counts\n",
"2017-06-29 13:21:16,308 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types\n",
"2017-06-29 13:21:53,123 : INFO : collected 4400410 word types from a corpus of 17003506 words (unigram + bigrams) and 1701 sentences\n",
"2017-06-29 13:21:53,124 : INFO : using 4400410 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>\n",
"2017-06-29 13:21:53,229 : INFO : collecting all words and their counts\n",
"2017-06-29 13:21:53,232 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types\n",
"2017-06-29 13:22:27,118 : INFO : collected 4400410 word types from a corpus of 17003506 words (unigram + bigrams) and 1701 sentences\n",
"2017-06-29 13:22:27,119 : INFO : using 4400410 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 loop, best of 3: 33.9 s per loop\n"
]
}
],
"source": [
"# currently on any2utf8 \n",
"from gensim.models import Phrases\n",
"bigram = Phrases(Text8Corpus(text8_file))\n",
"%timeit bigram = Phrases(Text8Corpus(text8_file))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-06-29 13:25:04,268 : INFO : collecting all words and their counts\n",
"2017-06-29 13:25:04,275 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types\n",
"2017-06-29 13:25:26,068 : INFO : collected 4400410 word types from a corpus of 17003506 words (unigram + bigrams) and 1701 sentences\n",
"2017-06-29 13:25:26,070 : INFO : using 4400410 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>\n",
"2017-06-29 13:25:26,187 : INFO : collecting all words and their counts\n",
"2017-06-29 13:25:26,189 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types\n",
"2017-06-29 13:25:47,507 : INFO : collected 4400410 word types from a corpus of 17003506 words (unigram + bigrams) and 1701 sentences\n",
"2017-06-29 13:25:47,508 : INFO : using 4400410 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>\n",
"2017-06-29 13:25:47,621 : INFO : collecting all words and their counts\n",
"2017-06-29 13:25:47,625 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types\n",
"2017-06-29 13:26:09,264 : INFO : collected 4400410 word types from a corpus of 17003506 words (unigram + bigrams) and 1701 sentences\n",
"2017-06-29 13:26:09,266 : INFO : using 4400410 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>\n",
"2017-06-29 13:26:09,386 : INFO : collecting all words and their counts\n",
"2017-06-29 13:26:09,389 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types\n",
"2017-06-29 13:26:30,828 : INFO : collected 4400410 word types from a corpus of 17003506 words (unigram + bigrams) and 1701 sentences\n",
"2017-06-29 13:26:30,829 : INFO : using 4400410 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>\n",
"2017-06-29 13:26:30,947 : INFO : collecting all words and their counts\n",
"2017-06-29 13:26:30,950 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types\n",
"2017-06-29 13:26:52,900 : INFO : collected 4400410 word types from a corpus of 17003506 words (unigram + bigrams) and 1701 sentences\n",
"2017-06-29 13:26:52,901 : INFO : using 4400410 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 loop, best of 3: 21.4 s per loop\n"
]
}
],
"source": [
"from gensim.models import Phrases\n",
"bigram = Phrases(Text8Corpus(text8_file), recode_to_utf8= False)\n",
"%timeit bigram = Phrases(Text8Corpus(text8_file), recode_to_utf8= False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
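Several commits above (db376eb, a8a0004: "detect encoding of input stream using next and iter") describe peeking at the first sentence of the corpus to decide whether it yields bytestrings or unicode, without losing that sentence when the input is a one-shot generator. A minimal stdlib-only sketch of that next()/iter() trick follows; the helper names `peek_first_sentence` and `input_is_bytes` are hypothetical illustrations, not gensim's actual API:

```python
from itertools import chain

def peek_first_sentence(sentences):
    """Peek at the first sentence of a (possibly one-shot) iterable.

    Returns (first_sentence, restored_iterable). Hypothetical helper
    sketching the next()/iter() approach from the PR, not gensim code.
    """
    it = iter(sentences)
    try:
        first = next(it)
    except StopIteration:
        return None, iter(())  # empty input: nothing to inspect
    # Re-attach the consumed sentence so the caller still sees everything.
    return first, chain([first], it)

def input_is_bytes(sentences):
    """Report whether the corpus yields bytestring tokens, without
    consuming it. Returns (is_bytes, restored_iterable)."""
    first, restored = peek_first_sentence(sentences)
    is_bytes = bool(first) and isinstance(first[0], bytes)
    return is_bytes, restored

# Usage: works for generators too, and the first sentence is preserved.
gen = iter([[b"machine", b"learning"], [b"human", b"interface"]])
is_bytes, gen = input_is_bytes(gen)
```

The design point is that a generator can only be iterated once, so the peeked sentence must be chained back in front of the remaining stream before it is handed to the vocabulary-collection loop.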
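The review comment in the diff suggests converting the streamed corpus to an in-memory list with list() so that %timeit measures the vocabulary-collection code rather than repeated disk IO (commit 7792e09 applies this to the benchmark). A stdlib-only sketch of that pattern, where `StreamedCorpus` and `count_bigrams` are toy stand-ins for `Text8Corpus` and the `Phrases` vocabulary pass:

```python
import timeit

class StreamedCorpus:
    """Toy stand-in for Text8Corpus: regenerated from scratch on every
    pass, the way a disk-backed corpus is re-read each time it is
    iterated."""
    def __init__(self, n_sentences):
        self.n_sentences = n_sentences

    def __iter__(self):
        for i in range(self.n_sentences):
            yield ["token%d" % i, "token%d" % (i + 1)]

def count_bigrams(sentences):
    # Toy stand-in for the Phrases vocabulary pass: visit adjacent pairs.
    total = 0
    for sentence in sentences:
        total += max(len(sentence) - 1, 0)
    return total

streamed = StreamedCorpus(2000)
in_memory = list(streamed)  # materialise once, as the review suggests

# Timing the in-memory list isolates the counting code from the
# (simulated) streaming overhead, giving a less noisy benchmark.
t_streamed = timeit.timeit(lambda: count_bigrams(streamed), number=20)
t_in_memory = timeit.timeit(lambda: count_bigrams(in_memory), number=20)
```

Both variants count the same bigrams; only the per-pass regeneration cost differs, which is exactly the noise the review asks to remove from the benchmark.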