Commit

Replace open() with smart_open() in notebooks. Fix #1789 (#1812)
* Replace open() with smart_open()

* Fix in Corpora_and_Vector_Spaces.ipynb

* Specify read/write explicitly while calling smart_open()
sharanry authored and menshikh-iv committed Mar 10, 2018
1 parent 2ee7a2c commit 17f3719
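
The change applied across the notebooks is a one-for-one swap of the built-in open() for smart_open() from the smart_open package, with the read/write mode passed explicitly. A minimal sketch of the pattern, using an illustrative path from the first notebook:

```python
from smart_open import smart_open

# Before: built-in open(), implicit platform-default text mode.
# for line in open('datasets/mycorpus.txt'):
#     ...

# After: smart_open() with an explicit mode. It returns a file-like object that
# iterates line by line, and it also handles compressed files and remote URIs
# (e.g. S3, HTTP) transparently.
for line in smart_open('datasets/mycorpus.txt', 'rb'):
    tokens = line.lower().split()  # in 'rb' mode each line is a bytes object
```

Making the mode explicit is the commit's third bullet; with 'rb' the downstream code receives bytes rather than str, which is what several of the inline review comments below discuss.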
Showing 15 changed files with 53 additions and 35 deletions.

6 changes: 4 additions & 2 deletions docs/notebooks/Corpora_and_Vector_Spaces.ipynb
@@ -279,9 +279,10 @@
},
"outputs": [],
"source": [
"from smart_open import smart_open\n",
"class MyCorpus(object):\n",
" def __iter__(self):\n",
" for line in open('datasets/mycorpus.txt'):\n",
" for line in smart_open('datasets/mycorpus.txt', 'rb'):\n",
" # assume there's one document per line, tokens separated by whitespace\n",
" yield dictionary.doc2bow(line.lower().split())"
]
@@ -374,9 +375,10 @@
],
"source": [
"from six import iteritems\n",
"from smart_open import smart_open\n",
"\n",
"# collect statistics about all tokens\n",
"dictionary = corpora.Dictionary(line.lower().split() for line in open('datasets/mycorpus.txt'))\n",
"dictionary = corpora.Dictionary(line.lower().split() for line in smart_open('datasets/mycorpus.txt', 'rb'))\n",
"\n",
"# remove stop words and words that appear only once\n",
"stop_ids = [dictionary.token2id[stopword] for stopword in stoplist \n",

16 changes: 8 additions & 8 deletions docs/notebooks/Poincare Evaluation.ipynb
@@ -697,7 +697,7 @@
" parts = first_line.rstrip().split(\"\\t\")\n",
" model_size = len(parts) - 1\n",
" vocab_size = len(lines)\n",
" with open(output_file, 'w') as f:\n",
" with smart_open(output_file, 'w') as f:\n",
" f.write('%d %d\\n' % (vocab_size, model_size))\n",
" for line in lines:\n",
" f.write(line.replace('\\t', ' '))\n",
@@ -709,7 +709,7 @@
" \n",
" model_size = random_embedding.shape[0]\n",
" vocab_size = len(np_embeddings)\n",
" with open(output_file, 'w') as f:\n",
" with smart_open(output_file, 'w') as f:\n",
" f.write('%d %d\\n' % (vocab_size, model_size))\n",
" for key, vector in np_embeddings.items():\n",
" vector_string = ' '.join('%.6f' % value for value in vector)\n",
@@ -1113,7 +1113,7 @@
" test_line_candidates = []\n",
" line_count = 0\n",
" all_nodes = set()\n",
" with open(data_file, 'rb') as f:\n",
" with smart_open(data_file, 'rb') as f:\n",
" for i, line in enumerate(f):\n",
" node_1, node_2 = line.split()\n",
" all_nodes.update([node_1, node_2])\n",
@@ -1135,9 +1135,9 @@
" train_line_indices = set(l for l in range(line_count) if l not in test_line_indices)\n",
" \n",
" train_set_nodes = set()\n",
" with open(data_file, 'rb') as f:\n",
" train_file = open(train_filename, 'wb')\n",
" test_file = open(test_filename, 'wb')\n",
" with smart_open(data_file, 'rb') as f:\n",
" train_file = smart_open(train_filename, 'wb')\n",
" test_file = smart_open(test_filename, 'wb')\n",
" for i, line in enumerate(f):\n",
" if i in train_line_indices:\n",
" train_set_nodes.update(line.split())\n",
@@ -1169,13 +1169,13 @@
" \"\"\"\n",
" root_candidates = set()\n",
" leaf_candidates = set()\n",
" with open(data_file, 'rb') as f:\n",
" with smart_open(data_file, 'rb') as f:\n",
" for line in f:\n",
" nodes = line.split()\n",
" root_candidates.update(nodes)\n",
" leaf_candidates.update(nodes)\n",
" \n",
" with open(data_file, 'rb') as f:\n",
" with smart_open(data_file, 'rb') as f:\n",
" for line in f:\n",
" node_1, node_2 = line.split()\n",
" if node_1 == node_2:\n",

9 changes: 5 additions & 4 deletions docs/notebooks/Tensorboard_visualizations.ipynb
@@ -624,6 +624,7 @@
"import pandas as pd\n",
"import smart_open\n",
"import random\n",
"from smart_open import smart_open\n",
"\n",
"# read data\n",
"dataframe = pd.read_csv('movie_plots.csv')\n",
@@ -803,7 +804,7 @@
},
"outputs": [],
"source": [
"with open('movie_plot_metadata.tsv','w') as w:\n",
"with smart_open('movie_plot_metadata.tsv','w') as w:\n",
" w.write('Titles\\tGenres\\n')\n",
" for i,j in zip(dataframe.Titles, dataframe.Genres):\n",
" w.write(\"%s\\t%s\\n\" % (i,j))"
@@ -1024,14 +1025,14 @@
"outputs": [],
"source": [
"# create file for tensors\n",
"with open('doc_lda_tensor.tsv','w') as w:\n",
"with smart_open('doc_lda_tensor.tsv','w') as w:\n",
" for doc_topics in all_topics:\n",
" for topics in doc_topics:\n",
" w.write(str(topics[1])+ \"\\t\")\n",
" w.write(\"\\n\")\n",
" \n",
"# create file for metadata\n",
"with open('doc_lda_metadata.tsv','w') as w:\n",
"with smart_open('doc_lda_metadata.tsv','w') as w:\n",
" w.write('Titles\\tGenres\\n')\n",
" for j, k in zip(dataframe.Titles, dataframe.Genres):\n",
" w.write(\"%s\\t%s\\n\" % (j, k))"
@@ -1084,7 +1085,7 @@
"\n",
"# overwrite metadata file\n",
"i=0\n",
"with open('doc_lda_metadata.tsv','w') as w:\n",
"with smart_open('doc_lda_metadata.tsv','w') as w:\n",
" w.write('Titles\\tGenres\\n')\n",
" for j,k in zip(dataframe.Titles, dataframe.Genres):\n",
" w.write(\"%s\\t%s\\n\" % (''.join((str(j), str(tensors[i]))),k))\n",

3 changes: 2 additions & 1 deletion docs/notebooks/WMD_tutorial.ipynb
@@ -302,6 +302,7 @@
"start = time()\n",
"\n",
"import json\n",
"from smart_open import smart_open\n",
"\n",
"# Business IDs of the restaurants.\n",
"ids = ['4bEjOyTaDG24SY5TxsaUNQ', '2e2e7WgqU1BnpxmQL5jbfw', 'zt1TpTuJ6y9n551sw9TaEg',\n",
@@ -310,7 +311,7 @@
"w2v_corpus = [] # Documents to train word2vec on (all 6 restaurants).\n",
"wmd_corpus = [] # Documents to run queries against (only one restaurant).\n",
"documents = [] # wmd_corpus, with no pre-processing (so we can see the original documents).\n",
"with open('/data/yelp_academic_dataset_review.json') as data_file:\n",
"with smart_open('/data/yelp_academic_dataset_review.json', 'rb') as data_file:\n",
" for line in data_file:\n",
" json_line = json.loads(line)\n",
" \n",

3 changes: 2 additions & 1 deletion docs/notebooks/Word2Vec_FastText_Comparison.ipynb
@@ -57,11 +57,12 @@
],
"source": [
"import nltk\n",
"from smart_open import smart_open\n",
"nltk.download('brown') \n",
"# Only the brown corpus is needed in case you don't have it.\n",
"\n",
"# Generate brown corpus text file\n",
"with open('brown_corp.txt', 'w+') as f:\n",
"with smart_open('brown_corp.txt', 'w+') as f:\n",
" for word in nltk.corpus.brown.words():\n",
" f.write('{word} '.format(word=word))\n",
"\n",

8 changes: 5 additions & 3 deletions docs/notebooks/Wordrank_comparisons.ipynb
@@ -38,20 +38,21 @@
],
"source": [
"import nltk\n",
"from smart_open import smart_open\n",
"from gensim.parsing.preprocessing import strip_punctuation, strip_multiple_whitespaces\n",
"\n",
"# Only the brown corpus is needed in case you don't have it.\n",
"nltk.download('brown') \n",
"\n",
"# Generate brown corpus text file\n",
"with open('brown_corp.txt', 'w+') as f:\n",
"with smart_open('brown_corp.txt', 'w+') as f:\n",
" for word in nltk.corpus.brown.words():\n",
" f.write('{word} '.format(word=word))\n",
" f.seek(0)\n",
" brown = f.read()\n",
"\n",
"# Preprocess brown corpus\n",
"with open('proc_brown_corp.txt', 'w') as f:\n",
"with smart_open('proc_brown_corp.txt', 'w') as f:\n",
" proc_brown = strip_punctuation(brown)\n",
" proc_brown = strip_multiple_whitespaces(proc_brown).lower()\n",
" f.write(proc_brown)\n",
@@ -1004,12 +1005,13 @@
"import copy\n",
"import multiprocessing\n",
"import numpy as np\n",
"from smart_open import smart_open\n",
"\n",
"\n",
"def compute_accuracies(model, freq):\n",
" # mean_freq will contain analogies together with the mean frequency of 4 words involved\n",
" mean_freq = {}\n",
" with open(word_analogies_file, 'r') as r:\n",
" with smart_open(word_analogies_file, 'r') as r:\n",
" for i, line in enumerate(r):\n",
" if ':' not in line:\n",
" analogy = tuple(line.split())\n",

6 changes: 4 additions & 2 deletions docs/notebooks/atmodel_tutorial.ipynb
@@ -105,6 +105,7 @@
"outputs": [],
"source": [
"import os, re\n",
"from smart_open import smart_open\n",
"\n",
"# Folder containing all NIPS papers.\n",
"data_dir = '/tmp/nipstxt/' # Set this path to the data on your machine.\n",
@@ -125,7 +126,7 @@
" \n",
" # Read document text.\n",
" # Note: ignoring characters that cause encoding errors.\n",
" with open(data_dir + yr_dir + '/' + filen, errors='ignore', encoding='utf-8') as fid:\n",
" with smart_open(data_dir + yr_dir + '/' + filen, encoding='utf-8', 'rb') as fid:\n",
" txt = fid.read()\n",
" \n",
" # Replace any whitespace (newline, tabs, etc.) by a single space.\n",
@@ -149,6 +150,7 @@
},
"outputs": [],
"source": [
"from smart_open import smart_open\n",
"filenames = [data_dir + 'idx/a' + yr + '.txt' for yr in yrs] # Using the years defined in previous cell.\n",
"\n",
"# Get all author names and their corresponding document IDs.\n",
@@ -157,7 +159,7 @@
"for yr in yrs:\n",
" # The files \"a00.txt\" and so on contain the author-document mappings.\n",
" filename = data_dir + 'idx/a' + yr + '.txt'\n",
" for line in open(filename, errors='ignore', encoding='utf-8'):\n",
" for line in smart_open(filename, errors='ignore', encoding='utf-8', 'rb'):\n",
" # Each line corresponds to one author.\n",
" contents = re.split(',', line)\n",
" author_name = (contents[1] + contents[0]).strip()\n",

7 changes: 5 additions & 2 deletions docs/notebooks/doc2vec-IMDB.ipynb
@@ -109,6 +109,7 @@
" return norm_text\n",
"\n",
"import time\n",
"import smart_open\n",
"start = time.clock()\n",
"\n",
"if not os.path.isfile('aclImdb/alldata-id.txt'):\n",
@@ -118,7 +119,7 @@
" print(\"Downloading IMDB archive...\")\n",
" url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename\n",
" r = requests.get(url)\n",
" with open(filename, 'wb') as f:\n",
" with smart_open.smart_open(filename, 'wb') as f:\n",
" f.write(r.content)\n",
" tar = tarfile.open(filename, mode='r')\n",
" tar.extractall()\n",
@@ -190,11 +191,13 @@
"import gensim\n",
"from gensim.models.doc2vec import TaggedDocument\n",
"from collections import namedtuple\n",
"from smart_open import smart_open\n",
"\n",
"SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')\n",
"\n",
"alldocs = [] # Will hold all docs in original order\n",
"with open('aclImdb/alldata-id.txt', encoding='utf-8') as alldata:\n",
"with smart_open('aclImdb/alldata-id.txt', 'rb') as alldata:\n",
" alldata = alldata.read().decode("utf-8")\n",

gojomo (Collaborator), Mar 12, 2018:

@sharanry @menshikh-iv - this double-quoting error may be breaking github-rendering of this notebook – and perhaps jupyter-viewing too? Was this loaded/tested before commit?

menshikh-iv (Contributor), Mar 13, 2018:

@gojomo yes, you were right, this was a quoting problem; I fixed it in 7459831, and now this renders correctly.

About testing, I already described my position on this PR in #1812 (comment) and, about notebooks in general, in #1964 (comment).

dvdff, Apr 17, 2018:

This change from open() to smart_open() breaks the code here (it now reads in every letter instead of every line).

menshikh-iv (Contributor), Apr 17, 2018:

We need to replace read with readlines in this case; thanks for the report @dvdff.

gojomo (Collaborator), Apr 17, 2018:

Shouldn't smart_open() be sufficiently behavior-compatible that the prior enumerate() on the return-value of smart_open() works the same as it did on the return-value of open()? (Neither read() nor readlines() required?)

menshikh-iv (Contributor), Apr 17, 2018:

Yes, you are right, but in this case alldata is already a string (not a file-like object); this is a mistake here.

gojomo (Collaborator), Apr 17, 2018:

No, before this patch it wasn't a string - it was the file-like object returned by open().

menshikh-iv (Contributor), Apr 18, 2018:

I was talking about the current state of the code; either way, this is a bug now.
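
A sketch of how this cell could avoid that bug by iterating the file-like object directly (as the pre-patch open() version did) and decoding line by line; this is an illustrative suggestion, not the fix that eventually landed:

```python
import gensim
from smart_open import smart_open

alldocs = []  # will hold all docs in original order
with smart_open('aclImdb/alldata-id.txt', 'rb') as alldata:
    # Iterate the file object itself rather than reading it into one big string;
    # each `line` is a bytes object, converted to unicode one line at a time.
    for line_no, line in enumerate(alldata):
        tokens = gensim.utils.to_unicode(line).split()
        words = tokens[1:]
        # ... build the SentimentDocument entries as in the rest of the cell
```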

" for line_no, line in enumerate(alldata):\n",
" tokens = gensim.utils.to_unicode(line).split()\n",
" words = tokens[1:]\n",

5 changes: 3 additions & 2 deletions docs/notebooks/gensim_news_classification.ipynb
@@ -63,6 +63,7 @@
"from gensim.models.wrappers import LdaMallet\n",
"from gensim.corpora import Dictionary\n",
"from pprint import pprint\n",
"from smart_open import smart_open\n",
"\n",
"%matplotlib inline"
]
@@ -122,7 +123,7 @@
}
],
"source": [
"with open(lee_train_file) as f:\n",
"with smart_open(lee_train_file, 'rb') as f:\n",
" for n, l in enumerate(f):\n",
" if n < 5:\n",
" print([l])"
@@ -151,7 +152,7 @@
" -------\n",
" yields preprocessed line\n",
" \"\"\"\n",
" with open(fname) as f:\n",
" with smart_open(fname, 'rb') as f:\n",
" for line in f:\n",
" yield gensim.utils.simple_preprocess(line, deacc=True, min_len=3)"
]

3 changes: 2 additions & 1 deletion docs/notebooks/lda_training_tips.ipynb
@@ -53,6 +53,7 @@
"# Read data.\n",
"\n",
"import os\n",
"from smart_open import smart_open\n",
"\n",
"# Folder containing all NIPS papers.\n",
"data_dir = 'nipstxt/'\n",
@@ -67,7 +68,7 @@
" files = os.listdir(data_dir + yr_dir)\n",
" for filen in files:\n",
" # Note: ignoring characters that cause encoding errors.\n",
" with open(data_dir + yr_dir + '/' + filen, errors='ignore') as fid:\n",
" with smart_open(data_dir + yr_dir + '/' + filen, 'rb') as fid:\n",
" txt = fid.read()\n",
" docs.append(txt)"
]

5 changes: 3 additions & 2 deletions docs/notebooks/online_w2v_tutorial.ipynb
@@ -28,7 +28,8 @@
"from gensim.models.word2vec import Word2Vec, LineSentence\n",
"from pprint import pprint\n",
"from copy import deepcopy\n",
"from multiprocessing import cpu_count"
"from multiprocessing import cpu_count\n",
"from smart_open import smart_open"
]
},
{
@@ -93,7 +94,7 @@
"outputs": [],
"source": [
"def write_wiki(wiki, name, titles = []):\n",
" with open('{}.wiki'.format(name), 'wb') as f:\n",
" with smart_open('{}.wiki'.format(name), 'wb') as f:\n",
" wiki.metadata = True\n",
" for text, (page_id, title) in wiki.get_texts():\n",
" if title not in titles:\n",

2 changes: 1 addition & 1 deletion docs/notebooks/poincare/poincare_numpy.patch
@@ -291,7 +291,7 @@ index ecae36e..f85bf22 100644
+ emb[neg[1]] = update(emb[neg[1]], -1*der_neg[1])
+ print('Epoch #%d, time taken: %.2f seconds' % (epoch + 1, time.time() - last_time))
+ last_time = time.time()
+ pickle.dump(emb, open(output_file, 'wb'))
+ pickle.dump(emb, smart_open(output_file, 'wb'))
+
+
+if __name__ == "__main__":

3 changes: 2 additions & 1 deletion docs/notebooks/test_notebooks.py
@@ -6,6 +6,7 @@
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert.preprocessors.execute import CellExecutionError
"from smart_open import smart_open\n",


def _notebook_run(path):
@@ -16,7 +17,7 @@ def _notebook_run(path):
this_file_directory = os.path.dirname(__file__)
errors = []
with tempfile.NamedTemporaryFile(suffix=".ipynb", mode='wt') as fout:
with open(path) as f:
with smart_open(path, 'rb') as f:
nb = nbformat.read(f, as_version=4)
nb.metadata.get('kernelspec', {})['name'] = kernel_name
ep = ExecutePreprocessor(kernel_name=kernel_name, timeout=10)

9 changes: 5 additions & 4 deletions docs/notebooks/topic_coherence-movies.ipynb
@@ -38,7 +38,8 @@
"from datetime import datetime\n",
"\n",
"from gensim.models import CoherenceModel\n",
"from gensim.corpora.dictionary import Dictionary"
"from gensim.corpora.dictionary import Dictionary\n",
"from smart_open import smart_open"
]
},
{
@@ -114,7 +115,7 @@
" # as well as pages about a single year.\n",
" # As a result, this preprocessing differs from the paper.\n",
" \n",
" with open(os.path.join(data_dir, fname)) as f:\n",
" with smart_open(os.path.join(data_dir, fname), 'rb') as f:\n",
" for line in f:\n",
" # lower case all words\n",
" lowered = line.lower()\n",
@@ -206,7 +207,7 @@
],
"source": [
"topics = [] # list of 100 topics\n",
"with open(topics_path) as f:\n",
"with smart_open(topics_path, 'rb') as f:\n",
" topics = [line.split() for line in f if line]\n",
"len(topics)"
]
@@ -231,7 +232,7 @@
],
"source": [
"human_scores = []\n",
"with open(human_scores_path) as f:\n",
"with smart_open(human_scores_path, 'rb') as f:\n",
" for line in f:\n",

piskvorky (Owner), Apr 17, 2018:

Does this work? What does "line" mean in a binary file ('rb' mode)?

(here and elsewhere)
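
For reference, a file opened in 'rb' mode still iterates one line at a time (split on b'\n'), but each line comes back as bytes, and on Python 3 float() does not accept bytes, so an explicit decode is needed. A small sketch under that assumption, with a hypothetical file name standing in for the notebook's human_scores_path:

```python
from smart_open import smart_open

human_scores = []
# 'gold_standard_scores.txt' is a stand-in for the notebook's human_scores_path.
with smart_open('gold_standard_scores.txt', 'rb') as f:
    for line in f:  # binary mode still yields one bytes line per newline
        text = line.decode('utf-8').strip()
        if text:
            human_scores.append(float(text))
```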

" human_scores.append(float(line.strip()))\n",
"len(human_scores)"

3 changes: 2 additions & 1 deletion docs/notebooks/word2vec.ipynb
@@ -109,13 +109,14 @@
"metadata": {},
"outputs": [],
"source": [
"from smart_open import smart_open\n",
"class MySentences(object):\n",
" def __init__(self, dirname):\n",
" self.dirname = dirname\n",
" \n",
" def __iter__(self):\n",
" for fname in os.listdir(self.dirname):\n",
" for line in open(os.path.join(self.dirname, fname)):\n",
" for line in smart_open(os.path.join(self.dirname, fname), 'rb'):\n",
" yield line.split()"
]
},