-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
Commit
* Replace open() with smart_open()
* Fix in Corpora_and_Vector_Spaces.ipynb
* Specify read/write explicitly while calling smart_open()
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -109,6 +109,7 @@ | |
" return norm_text\n", | ||
"\n", | ||
"import time\n", | ||
"import smart_open\n", | ||
"start = time.clock()\n", | ||
"\n", | ||
"if not os.path.isfile('aclImdb/alldata-id.txt'):\n", | ||
|
@@ -118,7 +119,7 @@ | |
" print(\"Downloading IMDB archive...\")\n", | ||
" url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename\n", | ||
" r = requests.get(url)\n", | ||
" with open(filename, 'wb') as f:\n", | ||
" with smart_open.smart_open(filename, 'wb') as f:\n", | ||
" f.write(r.content)\n", | ||
" tar = tarfile.open(filename, mode='r')\n", | ||
" tar.extractall()\n", | ||
|
@@ -190,11 +191,13 @@ | |
"import gensim\n", | ||
"from gensim.models.doc2vec import TaggedDocument\n", | ||
"from collections import namedtuple\n", | ||
"from smart_open import smart_open\n", | ||
"\n", | ||
"SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')\n", | ||
"\n", | ||
"alldocs = [] # Will hold all docs in original order\n", | ||
"with open('aclImdb/alldata-id.txt', encoding='utf-8') as alldata:\n", | ||
"with smart_open('aclImdb/alldata-id.txt', 'rb') as alldata:\n", | ||
"    alldata = alldata.read().decode('utf-8')\n", | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
menshikh-iv
Contributor
|
||
" for line_no, line in enumerate(alldata):\n", | ||
" tokens = gensim.utils.to_unicode(line).split()\n", | ||
" words = tokens[1:]\n", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,7 +38,8 @@ | |
"from datetime import datetime\n", | ||
"\n", | ||
"from gensim.models import CoherenceModel\n", | ||
"from gensim.corpora.dictionary import Dictionary" | ||
"from gensim.corpora.dictionary import Dictionary\n", | ||
"from smart_open import smart_open" | ||
] | ||
}, | ||
{ | ||
|
@@ -114,7 +115,7 @@ | |
" # as well as pages about a single year.\n", | ||
" # As a result, this preprocessing differs from the paper.\n", | ||
" \n", | ||
" with open(os.path.join(data_dir, fname)) as f:\n", | ||
" with smart_open(os.path.join(data_dir, fname), 'rb') as f:\n", | ||
" for line in f:\n", | ||
" # lower case all words\n", | ||
" lowered = line.lower()\n", | ||
|
@@ -206,7 +207,7 @@ | |
], | ||
"source": [ | ||
"topics = [] # list of 100 topics\n", | ||
"with open(topics_path) as f:\n", | ||
"with smart_open(topics_path, 'rb') as f:\n", | ||
" topics = [line.split() for line in f if line]\n", | ||
"len(topics)" | ||
] | ||
|
@@ -231,7 +232,7 @@ | |
], | ||
"source": [ | ||
"human_scores = []\n", | ||
"with open(human_scores_path) as f:\n", | ||
"with smart_open(human_scores_path, 'rb') as f:\n", | ||
" for line in f:\n", | ||
This comment has been minimized.
Sorry, something went wrong.
piskvorky
Owner
|
||
" human_scores.append(float(line.strip()))\n", | ||
"len(human_scores)" | ||
|
@sharanry @menshikh-iv - this double-quoting error may be breaking github-rendering of this notebook – and perhaps jupyter-viewing too? Was this loaded/tested before commit?