Commit

Replace open() with smart_open() in notebooks. Fix #1789 (#1812)
* Replace open() with smart_open()

* Fix in Corpora_and_Vector_Spaces.ipynb

* Specify read/write explicitly while calling smart_open()
sharanry authored and menshikh-iv committed Mar 10, 2018
1 parent 2ee7a2c commit 17f3719
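
The change applied across the notebooks is a one-for-one swap of the built-in open() for smart_open() from the smart_open package, with the read/write mode passed explicitly. A minimal sketch of the pattern, using an illustrative path from the first notebook:

```python
from smart_open import smart_open

# Before: built-in open(), implicit platform-default text mode.
# for line in open('datasets/mycorpus.txt'):
#     ...

# After: smart_open() with an explicit mode. It returns a file-like object that
# iterates line by line, and it also handles compressed files and remote URIs
# (e.g. S3, HTTP) transparently.
for line in smart_open('datasets/mycorpus.txt', 'rb'):
    tokens = line.lower().split()  # in 'rb' mode each line is a bytes object
```

Making the mode explicit is the commit's third bullet; with 'rb' the downstream code receives bytes rather than str, which is what several of the inline review comments below discuss.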
Showing 15 changed files with 53 additions and 35 deletions.

6 changes: 4 additions & 2 deletions docs/notebooks/Corpora_and_Vector_Spaces.ipynb
@@ -279,9 +279,10 @@
},
"outputs": [],
"source": [
"from smart_open import smart_open\n",
"class MyCorpus(object):\n",
" def __iter__(self):\n",
" for line in open('datasets/mycorpus.txt'):\n",
" for line in smart_open('datasets/mycorpus.txt', 'rb'):\n",
" # assume there's one document per line, tokens separated by whitespace\n",
" yield dictionary.doc2bow(line.lower().split())"
]
@@ -374,9 +375,10 @@
],
"source": [
"from six import iteritems\n",
"from smart_open import smart_open\n",
"\n",
"# collect statistics about all tokens\n",
"dictionary = corpora.Dictionary(line.lower().split() for line in open('datasets/mycorpus.txt'))\n",
"dictionary = corpora.Dictionary(line.lower().split() for line in smart_open('datasets/mycorpus.txt', 'rb'))\n",
"\n",
"# remove stop words and words that appear only once\n",
"stop_ids = [dictionary.token2id[stopword] for stopword in stoplist \n",

16 changes: 8 additions & 8 deletions docs/notebooks/Poincare Evaluation.ipynb
@@ -697,7 +697,7 @@
" parts = first_line.rstrip().split(\"\\t\")\n",
" model_size = len(parts) - 1\n",
" vocab_size = len(lines)\n",
" with open(output_file, 'w') as f:\n",
" with smart_open(output_file, 'w') as f:\n",
" f.write('%d %d\\n' % (vocab_size, model_size))\n",
" for line in lines:\n",
" f.write(line.replace('\\t', ' '))\n",
@@ -709,7 +709,7 @@
" \n",
" model_size = random_embedding.shape[0]\n",
" vocab_size = len(np_embeddings)\n",
" with open(output_file, 'w') as f:\n",
" with smart_open(output_file, 'w') as f:\n",
" f.write('%d %d\\n' % (vocab_size, model_size))\n",
" for key, vector in np_embeddings.items():\n",
" vector_string = ' '.join('%.6f' % value for value in vector)\n",
@@ -1113,7 +1113,7 @@
" test_line_candidates = []\n",
" line_count = 0\n",
" all_nodes = set()\n",
" with open(data_file, 'rb') as f:\n",
" with smart_open(data_file, 'rb') as f:\n",
" for i, line in enumerate(f):\n",
" node_1, node_2 = line.split()\n",
" all_nodes.update([node_1, node_2])\n",
@@ -1135,9 +1135,9 @@
" train_line_indices = set(l for l in range(line_count) if l not in test_line_indices)\n",
" \n",
" train_set_nodes = set()\n",
" with open(data_file, 'rb') as f:\n",
" train_file = open(train_filename, 'wb')\n",
" test_file = open(test_filename, 'wb')\n",
" with smart_open(data_file, 'rb') as f:\n",
" train_file = smart_open(train_filename, 'wb')\n",
" test_file = smart_open(test_filename, 'wb')\n",
" for i, line in enumerate(f):\n",
" if i in train_line_indices:\n",
" train_set_nodes.update(line.split())\n",
@@ -1169,13 +1169,13 @@
" \"\"\"\n",
" root_candidates = set()\n",
" leaf_candidates = set()\n",
" with open(data_file, 'rb') as f:\n",
" with smart_open(data_file, 'rb') as f:\n",
" for line in f:\n",
" nodes = line.split()\n",
" root_candidates.update(nodes)\n",
" leaf_candidates.update(nodes)\n",
" \n",
" with open(data_file, 'rb') as f:\n",
" with smart_open(data_file, 'rb') as f:\n",
" for line in f:\n",
" node_1, node_2 = line.split()\n",
" if node_1 == node_2:\n",

9 changes: 5 additions & 4 deletions docs/notebooks/Tensorboard_visualizations.ipynb
@@ -624,6 +624,7 @@
"import pandas as pd\n",
"import smart_open\n",
"import random\n",
"from smart_open import smart_open\n",
"\n",
"# read data\n",
"dataframe = pd.read_csv('movie_plots.csv')\n",
@@ -803,7 +804,7 @@
},
"outputs": [],
"source": [
"with open('movie_plot_metadata.tsv','w') as w:\n",
"with smart_open('movie_plot_metadata.tsv','w') as w:\n",
" w.write('Titles\\tGenres\\n')\n",
" for i,j in zip(dataframe.Titles, dataframe.Genres):\n",
" w.write(\"%s\\t%s\\n\" % (i,j))"
@@ -1024,14 +1025,14 @@
"outputs": [],
"source": [
"# create file for tensors\n",
"with open('doc_lda_tensor.tsv','w') as w:\n",
"with smart_open('doc_lda_tensor.tsv','w') as w:\n",
" for doc_topics in all_topics:\n",
" for topics in doc_topics:\n",
" w.write(str(topics[1])+ \"\\t\")\n",
" w.write(\"\\n\")\n",
" \n",
"# create file for metadata\n",
"with open('doc_lda_metadata.tsv','w') as w:\n",
"with smart_open('doc_lda_metadata.tsv','w') as w:\n",
" w.write('Titles\\tGenres\\n')\n",
" for j, k in zip(dataframe.Titles, dataframe.Genres):\n",
" w.write(\"%s\\t%s\\n\" % (j, k))"
@@ -1084,7 +1085,7 @@
"\n",
"# overwrite metadata file\n",
"i=0\n",
"with open('doc_lda_metadata.tsv','w') as w:\n",
"with smart_open('doc_lda_metadata.tsv','w') as w:\n",
" w.write('Titles\\tGenres\\n')\n",
" for j,k in zip(dataframe.Titles, dataframe.Genres):\n",
" w.write(\"%s\\t%s\\n\" % (''.join((str(j), str(tensors[i]))),k))\n",

3 changes: 2 additions & 1 deletion docs/notebooks/WMD_tutorial.ipynb
@@ -302,6 +302,7 @@
"start = time()\n",
"\n",
"import json\n",
"from smart_open import smart_open\n",
"\n",
"# Business IDs of the restaurants.\n",
"ids = ['4bEjOyTaDG24SY5TxsaUNQ', '2e2e7WgqU1BnpxmQL5jbfw', 'zt1TpTuJ6y9n551sw9TaEg',\n",
@@ -310,7 +311,7 @@
"w2v_corpus = [] # Documents to train word2vec on (all 6 restaurants).\n",
"wmd_corpus = [] # Documents to run queries against (only one restaurant).\n",
"documents = [] # wmd_corpus, with no pre-processing (so we can see the original documents).\n",
"with open('/data/yelp_academic_dataset_review.json') as data_file:\n",
"with smart_open('/data/yelp_academic_dataset_review.json', 'rb') as data_file:\n",
" for line in data_file:\n",
" json_line = json.loads(line)\n",
" \n",

3 changes: 2 additions & 1 deletion docs/notebooks/Word2Vec_FastText_Comparison.ipynb
@@ -57,11 +57,12 @@
],
"source": [
"import nltk\n",
"from smart_open import smart_open\n",
"nltk.download('brown') \n",
"# Only the brown corpus is needed in case you don't have it.\n",
"\n",
"# Generate brown corpus text file\n",
"with open('brown_corp.txt', 'w+') as f:\n",
"with smart_open('brown_corp.txt', 'w+') as f:\n",
" for word in nltk.corpus.brown.words():\n",
" f.write('{word} '.format(word=word))\n",
"\n",

8 changes: 5 additions & 3 deletions docs/notebooks/Wordrank_comparisons.ipynb
@@ -38,20 +38,21 @@
],
"source": [
"import nltk\n",
"from smart_open import smart_open\n",
"from gensim.parsing.preprocessing import strip_punctuation, strip_multiple_whitespaces\n",
"\n",
"# Only the brown corpus is needed in case you don't have it.\n",
"nltk.download('brown') \n",
"\n",
"# Generate brown corpus text file\n",
"with open('brown_corp.txt', 'w+') as f:\n",
"with smart_open('brown_corp.txt', 'w+') as f:\n",
" for word in nltk.corpus.brown.words():\n",
" f.write('{word} '.format(word=word))\n",
" f.seek(0)\n",
" brown = f.read()\n",
"\n",
"# Preprocess brown corpus\n",
"with open('proc_brown_corp.txt', 'w') as f:\n",
"with smart_open('proc_brown_corp.txt', 'w') as f:\n",
" proc_brown = strip_punctuation(brown)\n",
" proc_brown = strip_multiple_whitespaces(proc_brown).lower()\n",
" f.write(proc_brown)\n",
@@ -1004,12 +1005,13 @@
"import copy\n",
"import multiprocessing\n",
"import numpy as np\n",
"from smart_open import smart_open\n",
"\n",
"\n",
"def compute_accuracies(model, freq):\n",
" # mean_freq will contain analogies together with the mean frequency of 4 words involved\n",
" mean_freq = {}\n",
" with open(word_analogies_file, 'r') as r:\n",
" with smart_open(word_analogies_file, 'r') as r:\n",
" for i, line in enumerate(r):\n",
" if ':' not in line:\n",
" analogy = tuple(line.split())\n",

6 changes: 4 additions & 2 deletions docs/notebooks/atmodel_tutorial.ipynb
@@ -105,6 +105,7 @@
"outputs": [],
"source": [
"import os, re\n",
"from smart_open import smart_open\n",
"\n",
"# Folder containing all NIPS papers.\n",
"data_dir = '/tmp/nipstxt/' # Set this path to the data on your machine.\n",
@@ -125,7 +126,7 @@
" \n",
" # Read document text.\n",
" # Note: ignoring characters that cause encoding errors.\n",
" with open(data_dir + yr_dir + '/' + filen, errors='ignore', encoding='utf-8') as fid:\n",
" with smart_open(data_dir + yr_dir + '/' + filen, encoding='utf-8', 'rb') as fid:\n",
" txt = fid.read()\n",
" \n",
" # Replace any whitespace (newline, tabs, etc.) by a single space.\n",
@@ -149,6 +150,7 @@
},
"outputs": [],
"source": [
"from smart_open import smart_open\n",
"filenames = [data_dir + 'idx/a' + yr + '.txt' for yr in yrs] # Using the years defined in previous cell.\n",
"\n",
"# Get all author names and their corresponding document IDs.\n",
@@ -157,7 +159,7 @@
"for yr in yrs:\n",
" # The files \"a00.txt\" and so on contain the author-document mappings.\n",
" filename = data_dir + 'idx/a' + yr + '.txt'\n",
" for line in open(filename, errors='ignore', encoding='utf-8'):\n",
" for line in smart_open(filename, errors='ignore', encoding='utf-8', 'rb'):\n",
" # Each line corresponds to one author.\n",
" contents = re.split(',', line)\n",
" author_name = (contents[1] + contents[0]).strip()\n",

7 changes: 5 additions & 2 deletions docs/notebooks/doc2vec-IMDB.ipynb
@@ -109,6 +109,7 @@
" return norm_text\n",
"\n",
"import time\n",
"import smart_open\n",
"start = time.clock()\n",
"\n",
"if not os.path.isfile('aclImdb/alldata-id.txt'):\n",
@@ -118,7 +119,7 @@
" print(\"Downloading IMDB archive...\")\n",
" url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename\n",
" r = requests.get(url)\n",
" with open(filename, 'wb') as f:\n",
" with smart_open.smart_open(filename, 'wb') as f:\n",
" f.write(r.content)\n",
" tar = tarfile.open(filename, mode='r')\n",
" tar.extractall()\n",
@@ -190,11 +191,13 @@
"import gensim\n",
"from gensim.models.doc2vec import TaggedDocument\n",
"from collections import namedtuple\n",
"from smart_open import smart_open\n",
"\n",
"SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')\n",
"\n",
"alldocs = [] # Will hold all docs in original order\n",
"with open('aclImdb/alldata-id.txt', encoding='utf-8') as alldata:\n",
"with smart_open('aclImdb/alldata-id.txt', 'rb') as alldata:\n",
" alldata = alldata.read().decode("utf-8")\n",

gojomo (Collaborator), Mar 12, 2018:

@sharanry @menshikh-iv - this double-quoting error may be breaking github-rendering of this notebook – and perhaps jupyter-viewing too? Was this loaded/tested before commit?

menshikh-iv (Contributor), Mar 13, 2018:

@gojomo yes, you were right, this was a quoting problem; I fixed it in 7459831, and now this renders correctly.

About testing, I already described my position on this PR in #1812 (comment) and, about notebooks in general, in #1964 (comment).

dvdff, Apr 17, 2018:

This change from open() to smart_open() breaks the code here (it now reads in every letter instead of every line).

menshikh-iv (Contributor), Apr 17, 2018:

We need to replace read with readlines in this case; thanks for the report @dvdff.

gojomo (Collaborator), Apr 17, 2018:

Shouldn't smart_open() be sufficiently behavior-compatible that the prior enumerate() on the return-value of smart_open() works the same as it did on the return-value of open()? (Neither read() nor readlines() required?)

menshikh-iv (Contributor), Apr 17, 2018:

Yes, you are right, but in this case alldata is already a string (not a file-like object); this is a mistake here.

gojomo (Collaborator), Apr 17, 2018:

No, before this patch it wasn't a string - it was the file-like object returned by open().

menshikh-iv (Contributor), Apr 18, 2018:

I was talking about the current state of the code; either way, this is a bug now.
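
A sketch of how this cell could avoid that bug by iterating the file-like object directly (as the pre-patch open() version did) and decoding line by line; this is an illustrative suggestion, not the fix that eventually landed:

```python
import gensim
from smart_open import smart_open

alldocs = []  # will hold all docs in original order
with smart_open('aclImdb/alldata-id.txt', 'rb') as alldata:
    # Iterate the file object itself rather than reading it into one big string;
    # each `line` is a bytes object, converted to unicode one line at a time.
    for line_no, line in enumerate(alldata):
        tokens = gensim.utils.to_unicode(line).split()
        words = tokens[1:]
        # ... build the SentimentDocument entries as in the rest of the cell
```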

" for line_no, line in enumerate(alldata):\n",
" tokens = gensim.utils.to_unicode(line).split()\n",
" words = tokens[1:]\n",

5 changes: 3 additions & 2 deletions docs/notebooks/gensim_news_classification.ipynb
@@ -63,6 +63,7 @@
"from gensim.models.wrappers import LdaMallet\n",
"from gensim.corpora import Dictionary\n",
"from pprint import pprint\n",
"from smart_open import smart_open\n",
"\n",
"%matplotlib inline"
]
@@ -122,7 +123,7 @@
}
],
"source": [
"with open(lee_train_file) as f:\n",
"with smart_open(lee_train_file, 'rb') as f:\n",
" for n, l in enumerate(f):\n",
" if n < 5:\n",
" print([l])"
@@ -151,7 +152,7 @@
" -------\n",
" yields preprocessed line\n",
" \"\"\"\n",
" with open(fname) as f:\n",
" with smart_open(fname, 'rb') as f:\n",
" for line in f:\n",
" yield gensim.utils.simple_preprocess(line, deacc=True, min_len=3)"
]

3 changes: 2 additions & 1 deletion docs/notebooks/lda_training_tips.ipynb
@@ -53,6 +53,7 @@
"# Read data.\n",
"\n",
"import os\n",
"from smart_open import smart_open\n",
"\n",
"# Folder containing all NIPS papers.\n",
"data_dir = 'nipstxt/'\n",
@@ -67,7 +68,7 @@
" files = os.listdir(data_dir + yr_dir)\n",
" for filen in files:\n",
" # Note: ignoring characters that cause encoding errors.\n",
" with open(data_dir + yr_dir + '/' + filen, errors='ignore') as fid:\n",
" with smart_open(data_dir + yr_dir + '/' + filen, 'rb') as fid:\n",
" txt = fid.read()\n",
" docs.append(txt)"
]

5 changes: 3 additions & 2 deletions docs/notebooks/online_w2v_tutorial.ipynb
@@ -28,7 +28,8 @@
"from gensim.models.word2vec import Word2Vec, LineSentence\n",
"from pprint import pprint\n",
"from copy import deepcopy\n",
"from multiprocessing import cpu_count"
"from multiprocessing import cpu_count\n",
"from smart_open import smart_open"
]
},
{
@@ -93,7 +94,7 @@
"outputs": [],
"source": [
"def write_wiki(wiki, name, titles = []):\n",
" with open('{}.wiki'.format(name), 'wb') as f:\n",
" with smart_open('{}.wiki'.format(name), 'wb') as f:\n",
" wiki.metadata = True\n",
" for text, (page_id, title) in wiki.get_texts():\n",
" if title not in titles:\n",

2 changes: 1 addition & 1 deletion docs/notebooks/poincare/poincare_numpy.patch
@@ -291,7 +291,7 @@ index ecae36e..f85bf22 100644
+ emb[neg[1]] = update(emb[neg[1]], -1*der_neg[1])
+ print('Epoch #%d, time taken: %.2f seconds' % (epoch + 1, time.time() - last_time))
+ last_time = time.time()
+ pickle.dump(emb, open(output_file, 'wb'))
+ pickle.dump(emb, smart_open(output_file, 'wb'))
+
+
+if __name__ == "__main__":

3 changes: 2 additions & 1 deletion docs/notebooks/test_notebooks.py
@@ -6,6 +6,7 @@
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert.preprocessors.execute import CellExecutionError
"from smart_open import smart_open\n",


def _notebook_run(path):
@@ -16,7 +17,7 @@ def _notebook_run(path):
this_file_directory = os.path.dirname(__file__)
errors = []
with tempfile.NamedTemporaryFile(suffix=".ipynb", mode='wt') as fout:
with open(path) as f:
with smart_open(path, 'rb') as f:
nb = nbformat.read(f, as_version=4)
nb.metadata.get('kernelspec', {})['name'] = kernel_name
ep = ExecutePreprocessor(kernel_name=kernel_name, timeout=10)

9 changes: 5 additions & 4 deletions docs/notebooks/topic_coherence-movies.ipynb
@@ -38,7 +38,8 @@
"from datetime import datetime\n",
"\n",
"from gensim.models import CoherenceModel\n",
"from gensim.corpora.dictionary import Dictionary"
"from gensim.corpora.dictionary import Dictionary\n",
"from smart_open import smart_open"
]
},
{
@@ -114,7 +115,7 @@
" # as well as pages about a single year.\n",
" # As a result, this preprocessing differs from the paper.\n",
" \n",
" with open(os.path.join(data_dir, fname)) as f:\n",
" with smart_open(os.path.join(data_dir, fname), 'rb') as f:\n",
" for line in f:\n",
" # lower case all words\n",
" lowered = line.lower()\n",
@@ -206,7 +207,7 @@
],
"source": [
"topics = [] # list of 100 topics\n",
"with open(topics_path) as f:\n",
"with smart_open(topics_path, 'rb') as f:\n",
" topics = [line.split() for line in f if line]\n",
"len(topics)"
]
@@ -231,7 +232,7 @@
],
"source": [
"human_scores = []\n",
"with open(human_scores_path) as f:\n",
"with smart_open(human_scores_path, 'rb') as f:\n",
" for line in f:\n",

piskvorky (Owner), Apr 17, 2018:

Does this work? What does "line" mean in a binary file ('rb' mode)?

(here and elsewhere)
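
For reference, a file opened in 'rb' mode still iterates one line at a time (split on b'\n'), but each line comes back as bytes, and on Python 3 float() does not accept bytes, so an explicit decode is needed. A small sketch under that assumption, with a hypothetical file name standing in for the notebook's human_scores_path:

```python
from smart_open import smart_open

human_scores = []
# 'gold_standard_scores.txt' is a stand-in for the notebook's human_scores_path.
with smart_open('gold_standard_scores.txt', 'rb') as f:
    for line in f:  # binary mode still yields one bytes line per newline
        text = line.decode('utf-8').strip()
        if text:
            human_scores.append(float(text))
```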

" human_scores.append(float(line.strip()))\n",
"len(human_scores)"

3 changes: 2 additions & 1 deletion docs/notebooks/word2vec.ipynb
@@ -109,13 +109,14 @@
"metadata": {},
"outputs": [],
"source": [
"from smart_open import smart_open\n",
"class MySentences(object):\n",
" def __init__(self, dirname):\n",
" self.dirname = dirname\n",
" \n",
" def __iter__(self):\n",
" for fname in os.listdir(self.dirname):\n",
" for line in open(os.path.join(self.dirname, fname)):\n",
" for line in smart_open(os.path.join(self.dirname, fname), 'rb'):\n",
" yield line.split()"
]
},