From ec8c016767ae57a74a40a1cf3e56566c72d41272 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Sun, 30 Jul 2017 04:59:25 +0530 Subject: [PATCH 01/15] added download and catalogue functions --- gensim/__main__.py | 31 +++++++++++++++++++++++++++++++ gensim/api/__init__.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 gensim/__main__.py create mode 100644 gensim/api/__init__.py diff --git a/gensim/__main__.py b/gensim/__main__.py new file mode 100644 index 0000000000..b07862288b --- /dev/null +++ b/gensim/__main__.py @@ -0,0 +1,31 @@ +from __future__ import print_function +from __future__ import absolute_import +import sys +import argparse +from .api import download +from .api import catalogue +if __name__ == '__main__': + parser = argparse.ArgumentParser(description = "Gensim console API") + group = parser.add_mutually_exclusive_group() + group.add_argument("-d" ,"--download", nargs = 1,help = "To download a corpus/model : python -m gensim -d corpus/model name") + group.add_argument("-l", "--link", nargs = 2,help=" To store a shortcut to a corpus/model : python -m gensim -l source destination") + group.add_argument("-i", "--info",nargs = 1, help = "To get information about a corpus/model : python -m gensim -i model/corpus name") + group.add_argument("-c","--catalogue", help ="To get the list of all models/corpus stored : python -m gensim -c",action="store_true") + args = parser.parse_args() + if sys.argv[1] == "-d" or sys.argv[1] == "--download": + download(sys.argv[2]) + + elif sys.argv[1] == "-l" or sys.argv[1] =="--link": + link(sys.argv[2],sys.argv[3]) + + elif sys.argv[1] == "-i" or sys.argv[1] == "--info": + info(sys.argv[2]) + + elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": + catalogue() + + + + + + diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py new file mode 100644 index 0000000000..7f4ff7ed4b --- /dev/null +++ b/gensim/api/__init__.py @@ -0,0 +1,29 @@ +import subprocess +import json 
+import sys +import os +try: + from urllib.request import urlopen +except ImportError: + from urllib2 import urlopen +def download(file): + url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/" + data = catalogue() + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if file not in corpuses and file not in models: + print("Incorrect corpus/model name.") + else: + url = url+file+"/"+file+".tar.gz" + print("Downloading {m}".format(m=file)) + subprocess.call([sys.executable, '-m','pip','install','--no-cache-dir',url],env = os.environ.copy()) + +def catalogue(print_list = 0): + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" + response = urlopen(url) + data = response.read().decode("utf-8") + if print_list == 1 : + print(json.loads(data)) + else: + return json.loads(data) + From 636bfffb82233564e7f6fe6febf4ed97d0f713c6 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Mon, 31 Jul 2017 21:17:58 +0530 Subject: [PATCH 02/15] added link and info --- gensim/__main__.py | 2 ++ gensim/api/__init__.py | 51 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/gensim/__main__.py b/gensim/__main__.py index b07862288b..8f82c8089a 100644 --- a/gensim/__main__.py +++ b/gensim/__main__.py @@ -4,6 +4,8 @@ import argparse from .api import download from .api import catalogue +from .api import link +from .api import info if __name__ == '__main__': parser = argparse.ArgumentParser(description = "Gensim console API") group = parser.add_mutually_exclusive_group() diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 7f4ff7ed4b..d0e1e5ea2b 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -2,13 +2,17 @@ import json import sys import os +import pip +import importlib +from pathlib import Path +import os try: from urllib.request import urlopen except ImportError: from urllib2 import urlopen def download(file): url = 
"https://github.com/chaitaliSaini/Corpus_and_models/releases/download/" - data = catalogue() + data = catalogue(False) corpuses = data['gensim']['corpus'] models = data['gensim']['model'] if file not in corpuses and file not in models: @@ -18,12 +22,51 @@ def download(file): print("Downloading {m}".format(m=file)) subprocess.call([sys.executable, '-m','pip','install','--no-cache-dir',url],env = os.environ.copy()) -def catalogue(print_list = 0): +def catalogue(print_list = True): url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" response = urlopen(url) data = response.read().decode("utf-8") if print_list == 1 : - print(json.loads(data)) + data = json.loads(data) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + print("Corpuses available : ") + for corpus in corpuses: + print(corpus) + print("") + print("Models available : ") + for model in models: + print(model) else: return json.loads(data) - + +def link(file_name, shortcut_name): + packages = pip.get_installed_distributions() + package_is_installed = 0 + for package in packages: + if package.project_name == file_name: + package_is_installed = 1 + break + if package_is_installed == 0: + print("The model/corpus {f} has not been installed. Please install it using : python -m gensim -d {f}".format(f=file_name)) + package = importlib.import_module(file_name) + package_path = Path(package.__file__).parent.parent + package_path = package_path / file_name / file_name + gensim_path = importlib.import_module("gensim") + gensim_path = Path(gensim_path.__file__).parent + shortcut_path = gensim_path / 'data' / shortcut_name + + try: + os.symlink(str(package_path),str(shortcut_path)) + except: + print("Shortcut creation failed in gensim/data.") + print("Shortcut creation successful. 
The model/corpus can now be found in gensim/data.") + +def info(file_name): + data = catalogue(False) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if file_name in corpuses: + print(data['gensim']['corpus'][file_name]) + elif file_name in models: + print(data['gensim']['model'][file_name]) \ No newline at end of file From fffe203303c57966bd725450773a64c922413c51 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Tue, 1 Aug 2017 17:08:21 +0530 Subject: [PATCH 03/15] modeified link and info functions --- gensim/__main__.py | 53 ++++++++++-------- gensim/api/__init__.py | 121 +++++++++++++++++++++++------------------ 2 files changed, 97 insertions(+), 77 deletions(-) diff --git a/gensim/__main__.py b/gensim/__main__.py index 8f82c8089a..48f299b21e 100644 --- a/gensim/__main__.py +++ b/gensim/__main__.py @@ -7,27 +7,32 @@ from .api import link from .api import info if __name__ == '__main__': - parser = argparse.ArgumentParser(description = "Gensim console API") - group = parser.add_mutually_exclusive_group() - group.add_argument("-d" ,"--download", nargs = 1,help = "To download a corpus/model : python -m gensim -d corpus/model name") - group.add_argument("-l", "--link", nargs = 2,help=" To store a shortcut to a corpus/model : python -m gensim -l source destination") - group.add_argument("-i", "--info",nargs = 1, help = "To get information about a corpus/model : python -m gensim -i model/corpus name") - group.add_argument("-c","--catalogue", help ="To get the list of all models/corpus stored : python -m gensim -c",action="store_true") - args = parser.parse_args() - if sys.argv[1] == "-d" or sys.argv[1] == "--download": - download(sys.argv[2]) - - elif sys.argv[1] == "-l" or sys.argv[1] =="--link": - link(sys.argv[2],sys.argv[3]) - - elif sys.argv[1] == "-i" or sys.argv[1] == "--info": - info(sys.argv[2]) - - elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": - catalogue() - - - - - - + parser = 
argparse.ArgumentParser(description="Gensim console API") + group = parser.add_mutually_exclusive_group() + group.add_argument( + "-d", "--download", nargs=1, + help="To download a corpus/model : python -m gensim -d " + "corpus/model name") + group.add_argument( + "-l", "--link", nargs=2, + help="To store a shortcut to a corpus/model : python -m gensim -l " + "source destination") + group.add_argument( + "-i", "--info", nargs=1, + help="To get information about a corpus/model : python -m gensim -i " + "model/corpus name") + group.add_argument( + "-c", "--catalogue", help="To get the list of all models/corpus stored" + " : python -m gensim -c", action="store_true") + args = parser.parse_args() + if sys.argv[1] == "-d" or sys.argv[1] == "--download": + download(sys.argv[2]) + + elif sys.argv[1] == "-l" or sys.argv[1] == "--link": + link(sys.argv[2], sys.argv[3]) + + elif sys.argv[1] == "-i" or sys.argv[1] == "--info": + info(sys.argv[2]) + + elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": + catalogue() diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index d0e1e5ea2b..7c64011d52 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -10,63 +10,78 @@ from urllib.request import urlopen except ImportError: from urllib2 import urlopen + + def download(file): - url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/" - data = catalogue(False) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if file not in corpuses and file not in models: - print("Incorrect corpus/model name.") - else: - url = url+file+"/"+file+".tar.gz" - print("Downloading {m}".format(m=file)) - subprocess.call([sys.executable, '-m','pip','install','--no-cache-dir',url],env = os.environ.copy()) + url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/" + data = catalogue(False) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if file not in corpuses and file not in models: + 
print("Incorrect corpus/model name.") + else: + url = url+file+"/"+file+".tar.gz" + print("Downloading {m}".format(m=file)) + subprocess.call( + [sys.executable, '-m', 'pip', 'install', '--no-cache-dir', url], + env=os.environ.copy()) + + +def catalogue(print_list=True): + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" + response = urlopen(url) + data = response.read().decode("utf-8") + if print_list == 1: + data = json.loads(data) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + print("Corpuses available : ") + for corpus in corpuses: + print(corpus) + print("") + print("Models available : ") + for model in models: + print(model) + else: + return json.loads(data) -def catalogue(print_list = True): - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" - response = urlopen(url) - data = response.read().decode("utf-8") - if print_list == 1 : - data = json.loads(data) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - print("Corpuses available : ") - for corpus in corpuses: - print(corpus) - print("") - print("Models available : ") - for model in models: - print(model) - else: - return json.loads(data) def link(file_name, shortcut_name): - packages = pip.get_installed_distributions() - package_is_installed = 0 - for package in packages: - if package.project_name == file_name: - package_is_installed = 1 - break - if package_is_installed == 0: - print("The model/corpus {f} has not been installed. 
Please install it using : python -m gensim -d {f}".format(f=file_name)) - package = importlib.import_module(file_name) - package_path = Path(package.__file__).parent.parent - package_path = package_path / file_name / file_name - gensim_path = importlib.import_module("gensim") - gensim_path = Path(gensim_path.__file__).parent - shortcut_path = gensim_path / 'data' / shortcut_name + packages = pip.get_installed_distributions() + package_is_installed = 0 + for package in packages: + if package.project_name == file_name: + package_is_installed = 1 + break + if package_is_installed == 0: + print("The model/corpus {f} has not been installed".format(f=file_name)) + print("For installing use: python -m gensim -d {f}".format(f=file_name)) + sys.exit(0) + package = importlib.import_module(file_name) + package_path = Path(package.__file__).parent.parent + package_path = package_path / file_name / file_name + gensim_path = importlib.import_module("gensim") + gensim_path = Path(gensim_path.__file__).parent + shortcut_path = gensim_path / 'data' / shortcut_name + if os.path.exists(str(shortcut_path)): + print("This shortcut link already exists.") + sys.exit(0) + try: + os.symlink(str(package_path), str(shortcut_path)) + except: + print("Shortcut creation failed in gensim/data.") + sys.exit(0) + print("Shortcut creation successful. The model/corpus can now be found" + " in gensim/data.") - try: - os.symlink(str(package_path),str(shortcut_path)) - except: - print("Shortcut creation failed in gensim/data.") - print("Shortcut creation successful. 
The model/corpus can now be found in gensim/data.") def info(file_name): - data = catalogue(False) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if file_name in corpuses: - print(data['gensim']['corpus'][file_name]) - elif file_name in models: - print(data['gensim']['model'][file_name]) \ No newline at end of file + data = catalogue(False) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if file_name in corpuses: + print(data['gensim']['corpus'][file_name]) + elif file_name in models: + print(data['gensim']['model'][file_name]) + else: + print("Incorrect model/corpus name.") From f567dee022be5aff13d8ca595f151e1cc67da464 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Fri, 4 Aug 2017 15:18:12 +0530 Subject: [PATCH 04/15] Updated download function --- gensim/__main__.py | 10 ------ gensim/api/__init__.py | 72 +++++++++++++++--------------------------- 2 files changed, 25 insertions(+), 57 deletions(-) diff --git a/gensim/__main__.py b/gensim/__main__.py index 48f299b21e..33236f6ccb 100644 --- a/gensim/__main__.py +++ b/gensim/__main__.py @@ -4,7 +4,6 @@ import argparse from .api import download from .api import catalogue -from .api import link from .api import info if __name__ == '__main__': parser = argparse.ArgumentParser(description="Gensim console API") @@ -13,10 +12,6 @@ "-d", "--download", nargs=1, help="To download a corpus/model : python -m gensim -d " "corpus/model name") - group.add_argument( - "-l", "--link", nargs=2, - help="To store a shortcut to a corpus/model : python -m gensim -l " - "source destination") group.add_argument( "-i", "--info", nargs=1, help="To get information about a corpus/model : python -m gensim -i " @@ -27,12 +22,7 @@ args = parser.parse_args() if sys.argv[1] == "-d" or sys.argv[1] == "--download": download(sys.argv[2]) - - elif sys.argv[1] == "-l" or sys.argv[1] == "--link": - link(sys.argv[2], sys.argv[3]) - elif sys.argv[1] == "-i" or sys.argv[1] == "--info": 
info(sys.argv[2]) - elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": catalogue() diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 7c64011d52..557df6c46e 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -1,11 +1,14 @@ -import subprocess +from __future__ import print_function import json import sys import os -import pip -import importlib -from pathlib import Path -import os +import six +import tarfile +try: + import urllib.request as urllib +except ImportError: + import urllib + try: from urllib.request import urlopen except ImportError: @@ -13,22 +16,26 @@ def download(file): - url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/" - data = catalogue(False) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if file not in corpuses and file not in models: - print("Incorrect corpus/model name.") - else: - url = url+file+"/"+file+".tar.gz" - print("Downloading {m}".format(m=file)) - subprocess.call( - [sys.executable, '-m', 'pip', 'install', '--no-cache-dir', url], - env=os.environ.copy()) + url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/" + url = url+"download/"+file+"/"+file+".tar.gz" + user_dir = os.path.expanduser('~') + base_dir = os.path.join(user_dir, 'gensim-data') + extracted_folder_dir = os.path.join(base_dir, file) + if not os.path.exists(base_dir): + os.makedirs(base_dir) + compressed_folder_name = file+".tar.gz" + compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) + urllib.urlretrieve(url, compressed_folder_dir) + if not os.path.exists(extracted_folder_dir): + os.makedirs(extracted_folder_dir) + tar = tarfile.open(compressed_folder_dir) + tar.extractall(extracted_folder_dir) + tar.close() def catalogue(print_list=True): - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/" + url = url + "master/list.json" 
response = urlopen(url) data = response.read().decode("utf-8") if print_list == 1: @@ -46,35 +53,6 @@ def catalogue(print_list=True): return json.loads(data) -def link(file_name, shortcut_name): - packages = pip.get_installed_distributions() - package_is_installed = 0 - for package in packages: - if package.project_name == file_name: - package_is_installed = 1 - break - if package_is_installed == 0: - print("The model/corpus {f} has not been installed".format(f=file_name)) - print("For installing use: python -m gensim -d {f}".format(f=file_name)) - sys.exit(0) - package = importlib.import_module(file_name) - package_path = Path(package.__file__).parent.parent - package_path = package_path / file_name / file_name - gensim_path = importlib.import_module("gensim") - gensim_path = Path(gensim_path.__file__).parent - shortcut_path = gensim_path / 'data' / shortcut_name - if os.path.exists(str(shortcut_path)): - print("This shortcut link already exists.") - sys.exit(0) - try: - os.symlink(str(package_path), str(shortcut_path)) - except: - print("Shortcut creation failed in gensim/data.") - sys.exit(0) - print("Shortcut creation successful. 
The model/corpus can now be found" - " in gensim/data.") - - def info(file_name): data = catalogue(False) corpuses = data['gensim']['corpus'] From 61ba3d62b86ed1d1f017214e8a81cdc4df4a98b8 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Mon, 7 Aug 2017 11:25:05 +0530 Subject: [PATCH 05/15] Added logging --- gensim/api/__init__.py | 51 +++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 557df6c46e..099792fa5b 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -1,9 +1,11 @@ from __future__ import print_function + +import logging import json -import sys import os -import six import tarfile +import shutil +from ..utils import SaveLoad try: import urllib.request as urllib except ImportError: @@ -14,23 +16,52 @@ except ImportError: from urllib2 import urlopen +logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', filename="api.log", level=logging.INFO) +console = logging.StreamHandler() +console.setLevel(logging.INFO) +logging.getLogger('').addHandler(console) + -def download(file): +def download(file_name): url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/" - url = url+"download/"+file+"/"+file+".tar.gz" + url = url + "download/" + file_name + "/" + file_name + ".tar.gz" user_dir = os.path.expanduser('~') base_dir = os.path.join(user_dir, 'gensim-data') - extracted_folder_dir = os.path.join(base_dir, file) + extracted_folder_dir = os.path.join(base_dir, file_name) if not os.path.exists(base_dir): + logging.info("Creating {}".format(base_dir)) os.makedirs(base_dir) - compressed_folder_name = file+".tar.gz" + if os.path.exists(base_dir): + logging.info("Creation successful. Models/corpus can be accessed" + "via {}".format(base_dir)) + else: + logging.error("Not able to create {d}. 
Make sure you have the " + "correct read/write permissions of the {d} or you " + "can try creating it manually".format(d=base_dir)) + compressed_folder_name = file_name + ".tar.gz" compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) - urllib.urlretrieve(url, compressed_folder_dir) if not os.path.exists(extracted_folder_dir): + logging.info("Downloading {}".format(file_name)) + urllib.urlretrieve(url, compressed_folder_dir) + logging.info("{} downloaded".format(file_name)) + logging.info("Creating {}".format(extracted_folder_dir)) os.makedirs(extracted_folder_dir) - tar = tarfile.open(compressed_folder_dir) - tar.extractall(extracted_folder_dir) - tar.close() + if os.path.exists(extracted_folder_dir): + logging.info("Creation of {} successful" + ".".format(extracted_folder_dir)) + tar = tarfile.open(compressed_folder_dir) + logging.info("Extracting files from" + "{}".format(extracted_folder_dir)) + tar.extractall(extracted_folder_dir) + tar.close() + logging.info("{} installed".format(file_name)) + else: + logging.error("Not able to create {d}. 
Make sure you have the " + "correct read/write permissions of the {d} or you " + "can try creating it " + "manually".format(d=extracted_folder_dir)) + else: + print("{} has already been installed".format(file_name)) def catalogue(print_list=True): From d8257a387237208fa5fe5add7380614e079e78f0 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Fri, 11 Aug 2017 11:19:19 +0530 Subject: [PATCH 06/15] Added load function --- gensim/__main__.py | 35 +++++++++++--------- gensim/api/__init__.py | 74 +++++++++++++++++++++++++++--------------- 2 files changed, 68 insertions(+), 41 deletions(-) diff --git a/gensim/__main__.py b/gensim/__main__.py index 33236f6ccb..5ea8e43000 100644 --- a/gensim/__main__.py +++ b/gensim/__main__.py @@ -2,27 +2,32 @@ from __future__ import absolute_import import sys import argparse -from .api import download -from .api import catalogue -from .api import info +import logging +from gensim.api import download +from gensim.api import catalogue +from gensim.api import info if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Gensim console API") - group = parser.add_mutually_exclusive_group() - group.add_argument( + logging.basicConfig(format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', filename="api.log", level=logging.INFO) + console = logging.StreamHandler() + console.setLevel(logging.INFO) + logging.getLogger('').addHandler(console) + parser = argparse.ArgumentParser(description="Gensim console API") + group = parser.add_mutually_exclusive_group() + group.add_argument( "-d", "--download", nargs=1, help="To download a corpus/model : python -m gensim -d " "corpus/model name") - group.add_argument( + group.add_argument( "-i", "--info", nargs=1, help="To get information about a corpus/model : python -m gensim -i " "model/corpus name") - group.add_argument( + group.add_argument( "-c", "--catalogue", help="To get the list of all models/corpus stored" " : python -m gensim -c", action="store_true") - args = 
parser.parse_args() - if sys.argv[1] == "-d" or sys.argv[1] == "--download": - download(sys.argv[2]) - elif sys.argv[1] == "-i" or sys.argv[1] == "--info": - info(sys.argv[2]) - elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": - catalogue() + args = parser.parse_args() + if sys.argv[1] == "-d" or sys.argv[1] == "--download": + download(sys.argv[2]) + elif sys.argv[1] == "-i" or sys.argv[1] == "--info": + info(sys.argv[2]) + elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": + catalogue() diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 099792fa5b..0a70fc2b39 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -1,11 +1,12 @@ from __future__ import print_function - -import logging +from __future__ import absolute_import import json import os import tarfile import shutil -from ..utils import SaveLoad +import logging +import sys +import importlib try: import urllib.request as urllib except ImportError: @@ -16,10 +17,7 @@ except ImportError: from urllib2 import urlopen -logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', filename="api.log", level=logging.INFO) -console = logging.StreamHandler() -console.setLevel(logging.INFO) -logging.getLogger('').addHandler(console) +logger = logging.getLogger('gensim.api') def download(file_name): @@ -29,37 +27,42 @@ def download(file_name): base_dir = os.path.join(user_dir, 'gensim-data') extracted_folder_dir = os.path.join(base_dir, file_name) if not os.path.exists(base_dir): - logging.info("Creating {}".format(base_dir)) + logger.info("Creating {}".format(base_dir)) os.makedirs(base_dir) if os.path.exists(base_dir): - logging.info("Creation successful. Models/corpus can be accessed" - "via {}".format(base_dir)) + logger.info( + "Creation successful. Models/corpus can be accessed" + "via {}".format(base_dir)) else: - logging.error("Not able to create {d}. 
Make sure you have the " - "correct read/write permissions of the {d} or you " - "can try creating it manually".format(d=base_dir)) + logger.error( + "Not able to create {d}. Make sure you have the " + "correct read/write permissions of the {d} or you " + "can try creating it manually".format(d=base_dir)) compressed_folder_name = file_name + ".tar.gz" compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) if not os.path.exists(extracted_folder_dir): - logging.info("Downloading {}".format(file_name)) + logger.info("Downloading {}".format(file_name)) urllib.urlretrieve(url, compressed_folder_dir) - logging.info("{} downloaded".format(file_name)) - logging.info("Creating {}".format(extracted_folder_dir)) + logger.info("{} downloaded".format(file_name)) + logger.info("Creating {}".format(extracted_folder_dir)) os.makedirs(extracted_folder_dir) if os.path.exists(extracted_folder_dir): - logging.info("Creation of {} successful" - ".".format(extracted_folder_dir)) + logger.info( + "Creation of {} successful" + ".".format(extracted_folder_dir)) tar = tarfile.open(compressed_folder_dir) - logging.info("Extracting files from" - "{}".format(extracted_folder_dir)) + logger.info( + "Extracting files from" + "{}".format(extracted_folder_dir)) tar.extractall(extracted_folder_dir) tar.close() - logging.info("{} installed".format(file_name)) + logger.info("{} installed".format(file_name)) else: - logging.error("Not able to create {d}. Make sure you have the " - "correct read/write permissions of the {d} or you " - "can try creating it " - "manually".format(d=extracted_folder_dir)) + logger.error( + "Not able to create {d}. 
Make sure you have the " + "correct read/write permissions of the {d} or you " + "can try creating it " + "manually".format(d=extracted_folder_dir)) else: print("{} has already been installed".format(file_name)) @@ -69,7 +72,7 @@ def catalogue(print_list=True): url = url + "master/list.json" response = urlopen(url) data = response.read().decode("utf-8") - if print_list == 1: + if print_list: data = json.loads(data) corpuses = data['gensim']['corpus'] models = data['gensim']['model'] @@ -94,3 +97,22 @@ def info(file_name): print(data['gensim']['model'][file_name]) else: print("Incorrect model/corpus name.") + + +def load(file_name, return_path=False): + user_dir = os.path.expanduser('~') + base_dir = os.path.join(user_dir, 'gensim-data') + folder_dir = os.path.join(base_dir, file_name) + if not os.path.exists(folder_dir): + print( + "Incorrect model/corpus name. Use catalogue() to get a list of " + "avalible models/corpus. If the model/corpus name you entered is" + " in the catalogue, then please download the model/corpus by " + "calling download({f}) function".format(f=file_name)) + elif return_path: + return folder_dir + else: + sys.path.insert(0, base_dir) + module = __import__(file_name) + data = module.load_data() + return data From 55714696a233a3aa4f2a025880b7cf60fe18cb67 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Fri, 11 Aug 2017 12:11:10 +0530 Subject: [PATCH 07/15] Removed unused imports --- gensim/__main__.py | 34 ++++++++++++++++++---------------- gensim/api/__init__.py | 5 ++--- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/gensim/__main__.py b/gensim/__main__.py index 5ea8e43000..c6db55ab27 100644 --- a/gensim/__main__.py +++ b/gensim/__main__.py @@ -7,27 +7,29 @@ from gensim.api import catalogue from gensim.api import info if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', filename="api.log", level=logging.INFO) - console = logging.StreamHandler() - 
console.setLevel(logging.INFO) - logging.getLogger('').addHandler(console) - parser = argparse.ArgumentParser(description="Gensim console API") - group = parser.add_mutually_exclusive_group() - group.add_argument( + logging.basicConfig( + format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', + filename="api.log", level=logging.INFO) + console = logging.StreamHandler() + console.setLevel(logging.INFO) + logging.getLogger('').addHandler(console) + parser = argparse.ArgumentParser(description="Gensim console API") + group = parser.add_mutually_exclusive_group() + group.add_argument( "-d", "--download", nargs=1, help="To download a corpus/model : python -m gensim -d " "corpus/model name") - group.add_argument( + group.add_argument( "-i", "--info", nargs=1, help="To get information about a corpus/model : python -m gensim -i " "model/corpus name") - group.add_argument( + group.add_argument( "-c", "--catalogue", help="To get the list of all models/corpus stored" " : python -m gensim -c", action="store_true") - args = parser.parse_args() - if sys.argv[1] == "-d" or sys.argv[1] == "--download": - download(sys.argv[2]) - elif sys.argv[1] == "-i" or sys.argv[1] == "--info": - info(sys.argv[2]) - elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": - catalogue() + args = parser.parse_args() + if sys.argv[1] == "-d" or sys.argv[1] == "--download": + download(sys.argv[2]) + elif sys.argv[1] == "-i" or sys.argv[1] == "--info": + info(sys.argv[2]) + elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": + catalogue() diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 0a70fc2b39..990e90afcb 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -3,10 +3,9 @@ import json import os import tarfile -import shutil import logging import sys -import importlib + try: import urllib.request as urllib except ImportError: @@ -108,7 +107,7 @@ def load(file_name, return_path=False): "Incorrect model/corpus name. 
Use catalogue() to get a list of " "avalible models/corpus. If the model/corpus name you entered is" " in the catalogue, then please download the model/corpus by " - "calling download({f}) function".format(f=file_name)) + "calling download('{f}') function".format(f=file_name)) elif return_path: return folder_dir else: From cabf173a3b5f80465ef8ba3dab452fe8d197f45f Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Thu, 24 Aug 2017 00:17:28 +0530 Subject: [PATCH 08/15] added check for installed models --- gensim/__main__.py | 35 ------ gensim/api/__init__.py | 231 +++++++++++++++++++++++---------------- gensim/api/downloader.py | 19 ++++ 3 files changed, 158 insertions(+), 127 deletions(-) delete mode 100644 gensim/__main__.py create mode 100644 gensim/api/downloader.py diff --git a/gensim/__main__.py b/gensim/__main__.py deleted file mode 100644 index c6db55ab27..0000000000 --- a/gensim/__main__.py +++ /dev/null @@ -1,35 +0,0 @@ -from __future__ import print_function -from __future__ import absolute_import -import sys -import argparse -import logging -from gensim.api import download -from gensim.api import catalogue -from gensim.api import info -if __name__ == '__main__': - logging.basicConfig( - format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', - filename="api.log", level=logging.INFO) - console = logging.StreamHandler() - console.setLevel(logging.INFO) - logging.getLogger('').addHandler(console) - parser = argparse.ArgumentParser(description="Gensim console API") - group = parser.add_mutually_exclusive_group() - group.add_argument( - "-d", "--download", nargs=1, - help="To download a corpus/model : python -m gensim -d " - "corpus/model name") - group.add_argument( - "-i", "--info", nargs=1, - help="To get information about a corpus/model : python -m gensim -i " - "model/corpus name") - group.add_argument( - "-c", "--catalogue", help="To get the list of all models/corpus stored" - " : python -m gensim -c", action="store_true") - args = 
parser.parse_args() - if sys.argv[1] == "-d" or sys.argv[1] == "--download": - download(sys.argv[2]) - elif sys.argv[1] == "-i" or sys.argv[1] == "--info": - info(sys.argv[2]) - elif sys.argv[1] == "-c" or sys.argv[1] == "--catalogue": - catalogue() diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 990e90afcb..334f336078 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -5,113 +5,160 @@ import tarfile import logging import sys - +import shutil +import errno try: - import urllib.request as urllib + import urllib.request as urllib except ImportError: - import urllib + import urllib try: - from urllib.request import urlopen + from urllib.request import urlopen except ImportError: - from urllib2 import urlopen + from urllib2 import urlopen + + +user_dir = os.path.expanduser('~') +base_dir = os.path.join(user_dir, 'gensim-data') +log_file_dir = os.path.join(base_dir, 'api.log') +if not os.path.isdir(base_dir): + try: + os.makedirs(base_dir) + except OSError as e: + if e.errno == errno.EEXIST: + raise Exception( + "Not able to create folder gensim-data in {}. File gensim-data " + "exists in the direcory already.".format(user_dir)) + else: + raise Exception( + "Can't create {}. 
Make sure you have the read/write permissions " + "to the directory or you can try creating the folder manually" + .format(base_dir)) +logging.basicConfig( + format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', + filename=log_file_dir, level=logging.INFO) +console = logging.StreamHandler() +console.setLevel(logging.INFO) logger = logging.getLogger('gensim.api') +logger.addHandler(console) def download(file_name): - url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/" - url = url + "download/" + file_name + "/" + file_name + ".tar.gz" - user_dir = os.path.expanduser('~') - base_dir = os.path.join(user_dir, 'gensim-data') - extracted_folder_dir = os.path.join(base_dir, file_name) - if not os.path.exists(base_dir): - logger.info("Creating {}".format(base_dir)) - os.makedirs(base_dir) - if os.path.exists(base_dir): - logger.info( - "Creation successful. Models/corpus can be accessed" - "via {}".format(base_dir)) - else: - logger.error( - "Not able to create {d}. Make sure you have the " - "correct read/write permissions of the {d} or you " - "can try creating it manually".format(d=base_dir)) - compressed_folder_name = file_name + ".tar.gz" - compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) - if not os.path.exists(extracted_folder_dir): - logger.info("Downloading {}".format(file_name)) - urllib.urlretrieve(url, compressed_folder_dir) - logger.info("{} downloaded".format(file_name)) - logger.info("Creating {}".format(extracted_folder_dir)) - os.makedirs(extracted_folder_dir) - if os.path.exists(extracted_folder_dir): - logger.info( - "Creation of {} successful" - ".".format(extracted_folder_dir)) - tar = tarfile.open(compressed_folder_dir) - logger.info( - "Extracting files from" - "{}".format(extracted_folder_dir)) - tar.extractall(extracted_folder_dir) - tar.close() - logger.info("{} installed".format(file_name)) - else: - logger.error( - "Not able to create {d}. 
Make sure you have the " - "correct read/write permissions of the {d} or you " - "can try creating it " - "manually".format(d=extracted_folder_dir)) - else: - print("{} has already been installed".format(file_name)) + url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/{f}/{f}.tar.gz".format(f=file_name) + data_folder_dir = os.path.join(base_dir, file_name) + data = catalogue(print_list=False) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if file_name not in corpuses and file_name not in models: + logger.error( + "Incorect Model/corpus name. Use catalogue(print_list=TRUE) or" + " python -m gensim -c to get a list of models/corpuses" + " available.") + sys.exit(0) + compressed_folder_name = "{f}.tar.gz".format(f=file_name) + compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) + is_installed = False + is_downloaded = False + installed_message = "{f} installed".format(f=file_name) + downloaded_message = "{f} downloaded".format(f=file_name) + if os.path.exists(data_folder_dir): + log_file_dir = os.path.join(base_dir, 'api.log') + with open(log_file_dir) as f: + f = f.readlines() + for line in f: + if installed_message in line: + print("{} has already been installed".format(file_name)) + is_installed = True + sys.exit(0) + if os.path.exists(data_folder_dir) and not is_installed: + shutil.rmtree(data_folder_dir) + for line in f: + if downloaded_message in line: + is_downloaded = True + break + if not is_downloaded: + os.makedirs(data_folder_dir) + logger.info("Downloading %s", file_name) + urllib.urlretrieve(url, compressed_folder_dir) + data_url = data_links(file_name) + if data_url is not None: + index = data_url.rfind("/") + data_dir = os.path.join(data_folder_dir, data_url[index+1:]) + urllib.urlretrieve(data_url, data_dir) + logger.info("%s downloaded", file_name) + if not is_installed: + logger.info("Creating %s", data_folder_dir) + if os.path.exists(data_folder_dir): + logger.info("Creation of 
%s successful.", data_folder_dir) + tar = tarfile.open(compressed_folder_dir) + logger.info("Extracting files from %s", data_folder_dir) + tar.extractall(data_folder_dir) + tar.close() + logger.info("%s installed", file_name) + else: + logger.error( + "Not able to create %s. Make sure you have the correct read/" + "write permissions for %s or you can try creating it manually", + data_folder_dir, base_dir) -def catalogue(print_list=True): - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/" - url = url + "master/list.json" - response = urlopen(url) - data = response.read().decode("utf-8") - if print_list: - data = json.loads(data) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - print("Corpuses available : ") - for corpus in corpuses: - print(corpus) - print("") - print("Models available : ") - for model in models: - print(model) - else: - return json.loads(data) +def catalogue(print_list=False): + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" + response = urlopen(url) + data = response.read().decode("utf-8") + data = json.loads(data) + if print_list: + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + print("Corpuses available : ") + for corpus in corpuses: + print(corpus) + print("") + print("Models available : ") + for model in models: + print(model) + return data def info(file_name): - data = catalogue(False) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if file_name in corpuses: - print(data['gensim']['corpus'][file_name]) - elif file_name in models: - print(data['gensim']['model'][file_name]) - else: - print("Incorrect model/corpus name.") + data = catalogue(False) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if file_name in corpuses: + print(data['gensim']['corpus'][file_name]) + elif file_name in models: + print(data['gensim']['model'][file_name]) + else: + catalogue(print_list=True) 
+ raise Exception( + "Incorrect model/corpus name. Choose the model/corpus from the list " + "above.") def load(file_name, return_path=False): - user_dir = os.path.expanduser('~') - base_dir = os.path.join(user_dir, 'gensim-data') - folder_dir = os.path.join(base_dir, file_name) - if not os.path.exists(folder_dir): - print( - "Incorrect model/corpus name. Use catalogue() to get a list of " - "avalible models/corpus. If the model/corpus name you entered is" - " in the catalogue, then please download the model/corpus by " - "calling download('{f}') function".format(f=file_name)) - elif return_path: - return folder_dir - else: - sys.path.insert(0, base_dir) - module = __import__(file_name) - data = module.load_data() - return data + user_dir = os.path.expanduser('~') + base_dir = os.path.join(user_dir, 'gensim-data') + folder_dir = os.path.join(base_dir, file_name) + if not os.path.exists(folder_dir): + raise Exception( + "Incorrect model/corpus name. Use catalogue(print_list=True) to get a list of " + "avalible models/corpus. 
If the model/corpus name you entered is" + " in the catalogue, then please download the model/corpus by " + "calling download('{f}') function".format(f=file_name)) + elif return_path: + return folder_dir + else: + sys.path.insert(0, base_dir) + module = __import__(file_name) + data = module.load_data() + return data + + +def data_links(file_name): + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/links.json" + response = urlopen(url) + data = response.read().decode("utf-8") + data = json.loads(data) + if file_name in data['data_links']: + return data['data_links'][file_name]['link'] diff --git a/gensim/api/downloader.py b/gensim/api/downloader.py new file mode 100644 index 0000000000..834628fe7f --- /dev/null +++ b/gensim/api/downloader.py @@ -0,0 +1,19 @@ +from __future__ import print_function +from __future__ import absolute_import +import argparse +from gensim.api import download +from gensim.api import catalogue +from gensim.api import info +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Gensim console API") + group = parser.add_mutually_exclusive_group() + group.add_argument("-d", "--download", nargs=1, help="To download a corpus/model : python -m gensim -d corpus/model name") + group.add_argument("-i", "--info", nargs=1, help="To get information about a corpus/model : python -m gensim -i model/corpus name") + group.add_argument("-c", "--catalogue", help="To get the list of all models/corpus stored : python -m gensim -c", action="store_true") + args = parser.parse_args() + if args.download is not None: + download(args.download[0]) + elif args.info is not None: + info(args.info[0]) + elif args.catalogue is not None: + catalogue(print_list=True) From 5d509fc92476b1fb621d25e5637a298a32c1ca15 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Sun, 27 Aug 2017 19:43:53 +0530 Subject: [PATCH 09/15] updated download function --- gensim/api/__init__.py | 246 ++++++++++++++++++++--------------------- 1 file 
changed, 122 insertions(+), 124 deletions(-) diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 334f336078..370af50573 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -5,39 +5,38 @@ import tarfile import logging import sys -import shutil import errno try: - import urllib.request as urllib + import urllib.request as urllib except ImportError: - import urllib + import urllib try: - from urllib.request import urlopen + from urllib.request import urlopen except ImportError: - from urllib2 import urlopen + from urllib2 import urlopen user_dir = os.path.expanduser('~') base_dir = os.path.join(user_dir, 'gensim-data') log_file_dir = os.path.join(base_dir, 'api.log') if not os.path.isdir(base_dir): - try: - os.makedirs(base_dir) - except OSError as e: - if e.errno == errno.EEXIST: - raise Exception( - "Not able to create folder gensim-data in {}. File gensim-data " - "exists in the direcory already.".format(user_dir)) - else: - raise Exception( - "Can't create {}. Make sure you have the read/write permissions " - "to the directory or you can try creating the folder manually" - .format(base_dir)) + try: + os.makedirs(base_dir) + except OSError as e: + if e.errno == errno.EEXIST: + raise Exception( + "Not able to create folder gensim-data in {}. File gensim-data " + "exists in the direcory already.".format(user_dir)) + else: + raise Exception( + "Can't create {}. 
Make sure you have the read/write permissions " + "to the directory or you can try creating the folder manually" + .format(base_dir)) logging.basicConfig( - format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', - filename=log_file_dir, level=logging.INFO) + format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', + filename=log_file_dir, level=logging.INFO) console = logging.StreamHandler() console.setLevel(logging.INFO) logger = logging.getLogger('gensim.api') @@ -45,120 +44,119 @@ def download(file_name): - url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/{f}/{f}.tar.gz".format(f=file_name) - data_folder_dir = os.path.join(base_dir, file_name) - data = catalogue(print_list=False) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if file_name not in corpuses and file_name not in models: - logger.error( - "Incorect Model/corpus name. Use catalogue(print_list=TRUE) or" - " python -m gensim -c to get a list of models/corpuses" - " available.") - sys.exit(0) - compressed_folder_name = "{f}.tar.gz".format(f=file_name) - compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) - is_installed = False - is_downloaded = False - installed_message = "{f} installed".format(f=file_name) - downloaded_message = "{f} downloaded".format(f=file_name) - if os.path.exists(data_folder_dir): - log_file_dir = os.path.join(base_dir, 'api.log') - with open(log_file_dir) as f: - f = f.readlines() - for line in f: - if installed_message in line: - print("{} has already been installed".format(file_name)) - is_installed = True - sys.exit(0) - if os.path.exists(data_folder_dir) and not is_installed: - shutil.rmtree(data_folder_dir) - for line in f: - if downloaded_message in line: - is_downloaded = True - break - if not is_downloaded: - os.makedirs(data_folder_dir) - logger.info("Downloading %s", file_name) - urllib.urlretrieve(url, compressed_folder_dir) - data_url = data_links(file_name) - if data_url is not None: - 
index = data_url.rfind("/") - data_dir = os.path.join(data_folder_dir, data_url[index+1:]) - urllib.urlretrieve(data_url, data_dir) - logger.info("%s downloaded", file_name) - if not is_installed: - logger.info("Creating %s", data_folder_dir) - if os.path.exists(data_folder_dir): - logger.info("Creation of %s successful.", data_folder_dir) - tar = tarfile.open(compressed_folder_dir) - logger.info("Extracting files from %s", data_folder_dir) - tar.extractall(data_folder_dir) - tar.close() - logger.info("%s installed", file_name) - else: - logger.error( - "Not able to create %s. Make sure you have the correct read/" - "write permissions for %s or you can try creating it manually", - data_folder_dir, base_dir) + url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/{f}/{f}.tar.gz".format(f=file_name) + data_folder_dir = os.path.join(base_dir, file_name) + data = catalogue(print_list=False) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if file_name not in corpuses and file_name not in models: + logger.error( + "Incorect Model/corpus name. 
Use catalogue(print_list=TRUE) or" + " python -m gensim -c to get a list of models/corpuses" + " available.") + sys.exit(0) + compressed_folder_name = "{f}.tar.gz".format(f=file_name) + compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) + is_installed = False + is_downloaded = False + installed_message = "{f} installed".format(f=file_name) + downloaded_message = "{f} downloaded".format(f=file_name) + if os.path.exists(data_folder_dir): + log_file_dir = os.path.join(base_dir, 'api.log') + with open(log_file_dir) as f: + f = f.readlines() + for line in f: + if installed_message in line: + print("{} has already been installed".format(file_name)) + is_installed = True + sys.exit(0) + if os.path.exists(data_folder_dir) and not is_installed: + for line in f: + if downloaded_message in line: + is_downloaded = True + break + if not is_downloaded: + if not os.path.exists(data_folder_dir): + logger.info("Creating %s", data_folder_dir) + os.makedirs(data_folder_dir) + if os.path.exists(data_folder_dir): + logger.info("Creation of %s successful.", data_folder_dir) + else: + logger.error( + "Not able to create %s. 
Make sure you have the correct read/" + "write permissions for %s or you can try creating it manually", + data_folder_dir, base_dir) + sys.exit(0) + logger.info("Downloading %s", file_name) + urllib.urlretrieve(url, compressed_folder_dir) + data_url = data_links(file_name) + if data_url is not None: + index = data_url.rfind("/") + data_dir = os.path.join(data_folder_dir, data_url[index+1:]) + urllib.urlretrieve(data_url, data_dir) + logger.info("%s downloaded", file_name) + if not is_installed: + tar = tarfile.open(compressed_folder_dir) + logger.info("Extracting files from %s", data_folder_dir) + tar.extractall(data_folder_dir) + tar.close() + logger.info("%s installed", file_name) def catalogue(print_list=False): - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" - response = urlopen(url) - data = response.read().decode("utf-8") - data = json.loads(data) - if print_list: - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - print("Corpuses available : ") - for corpus in corpuses: - print(corpus) - print("") - print("Models available : ") - for model in models: - print(model) - return data + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" + response = urlopen(url) + data = response.read().decode("utf-8") + data = json.loads(data) + if print_list: + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + print("Corpuses available : ") + for corpus in corpuses: + print(corpus) + print("") + print("Models available : ") + for model in models: + print(model) + return data def info(file_name): - data = catalogue(False) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if file_name in corpuses: - print(data['gensim']['corpus'][file_name]) - elif file_name in models: - print(data['gensim']['model'][file_name]) - else: - catalogue(print_list=True) - raise Exception( - "Incorrect model/corpus name. 
Choose the model/corpus from the list " - "above.") + data = catalogue(False) + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if file_name in corpuses: + print(data['gensim']['corpus'][file_name]) + elif file_name in models: + print(data['gensim']['model'][file_name]) + else: + catalogue(print_list=True) + raise Exception( + "Incorrect model/corpus name. Choose the model/corpus from the list " + "above.") def load(file_name, return_path=False): - user_dir = os.path.expanduser('~') - base_dir = os.path.join(user_dir, 'gensim-data') - folder_dir = os.path.join(base_dir, file_name) - if not os.path.exists(folder_dir): - raise Exception( - "Incorrect model/corpus name. Use catalogue(print_list=True) to get a list of " - "avalible models/corpus. If the model/corpus name you entered is" - " in the catalogue, then please download the model/corpus by " - "calling download('{f}') function".format(f=file_name)) - elif return_path: - return folder_dir - else: - sys.path.insert(0, base_dir) - module = __import__(file_name) - data = module.load_data() - return data + folder_dir = os.path.join(base_dir, file_name) + if not os.path.exists(folder_dir): + raise Exception( + "Incorrect model/corpus name. Use catalogue(print_list=True) to get a list of " + "avalible models/corpus. 
If the model/corpus name you entered is" + " in the catalogue, then please download the model/corpus by " + "calling download('{f}') function".format(f=file_name)) + elif return_path: + return folder_dir + else: + sys.path.insert(0, base_dir) + module = __import__(file_name) + data = module.load_data() + return data def data_links(file_name): - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/links.json" - response = urlopen(url) - data = response.read().decode("utf-8") - data = json.loads(data) - if file_name in data['data_links']: - return data['data_links'][file_name]['link'] + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/links.json" + response = urlopen(url) + data = response.read().decode("utf-8") + data = json.loads(data) + if file_name in data['data_links']: + return data['data_links'][file_name]['link'] From 551f54ef3f9ae53cb973cba16d0a6627dcdd674e Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Mon, 4 Sep 2017 22:11:32 +0530 Subject: [PATCH 10/15] Improved help for terminal --- gensim/api/__init__.py | 2 +- gensim/api/downloader.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 370af50573..15a3359936 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -92,7 +92,7 @@ def download(file_name): data_url = data_links(file_name) if data_url is not None: index = data_url.rfind("/") - data_dir = os.path.join(data_folder_dir, data_url[index+1:]) + data_dir = os.path.join(data_folder_dir, data_url[index + 1:]) urllib.urlretrieve(data_url, data_dir) logger.info("%s downloaded", file_name) if not is_installed: diff --git a/gensim/api/downloader.py b/gensim/api/downloader.py index 834628fe7f..0accff9f8c 100644 --- a/gensim/api/downloader.py +++ b/gensim/api/downloader.py @@ -5,10 +5,10 @@ from gensim.api import catalogue from gensim.api import info if __name__ == '__main__': - parser = 
argparse.ArgumentParser(description="Gensim console API") + parser = argparse.ArgumentParser(description="Gensim console API", usage="python -m gensim.api.downloader [-h] [-d dataset_name | -i dataset_name | -c]") group = parser.add_mutually_exclusive_group() - group.add_argument("-d", "--download", nargs=1, help="To download a corpus/model : python -m gensim -d corpus/model name") - group.add_argument("-i", "--info", nargs=1, help="To get information about a corpus/model : python -m gensim -i model/corpus name") + group.add_argument("-d", "--download", metavar="dataset_name", nargs=1, help="To download a corpus/model : python -m gensim -d corpus/model name") + group.add_argument("-i", "--info", metavar="dataset_name", nargs=1, help="To get information about a corpus/model : python -m gensim -i model/corpus name") group.add_argument("-c", "--catalogue", help="To get the list of all models/corpus stored : python -m gensim -c", action="store_true") args = parser.parse_args() if args.download is not None: From ff5509ffdfa7b189379af995938509fdc6868dbc Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Wed, 6 Sep 2017 13:29:14 +0530 Subject: [PATCH 11/15] load returns model path --- gensim/api/__init__.py | 66 +++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py index 15a3359936..a1ad63ebb4 100644 --- a/gensim/api/__init__.py +++ b/gensim/api/__init__.py @@ -43,31 +43,31 @@ logger.addHandler(console) -def download(file_name): - url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/{f}/{f}.tar.gz".format(f=file_name) - data_folder_dir = os.path.join(base_dir, file_name) +def download(dataset): + url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/{f}/{f}.tar.gz".format(f=dataset) + data_folder_dir = os.path.join(base_dir, dataset) data = catalogue(print_list=False) corpuses = data['gensim']['corpus'] models = 
data['gensim']['model'] - if file_name not in corpuses and file_name not in models: + if dataset not in corpuses and dataset not in models: logger.error( "Incorect Model/corpus name. Use catalogue(print_list=TRUE) or" " python -m gensim -c to get a list of models/corpuses" " available.") sys.exit(0) - compressed_folder_name = "{f}.tar.gz".format(f=file_name) + compressed_folder_name = "{f}.tar.gz".format(f=dataset) compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) is_installed = False is_downloaded = False - installed_message = "{f} installed".format(f=file_name) - downloaded_message = "{f} downloaded".format(f=file_name) + installed_message = "{f} installed".format(f=dataset) + downloaded_message = "{f} downloaded".format(f=dataset) if os.path.exists(data_folder_dir): log_file_dir = os.path.join(base_dir, 'api.log') with open(log_file_dir) as f: f = f.readlines() for line in f: if installed_message in line: - print("{} has already been installed".format(file_name)) + print("{} has already been installed".format(dataset)) is_installed = True sys.exit(0) if os.path.exists(data_folder_dir) and not is_installed: @@ -87,24 +87,24 @@ def download(file_name): "write permissions for %s or you can try creating it manually", data_folder_dir, base_dir) sys.exit(0) - logger.info("Downloading %s", file_name) + logger.info("Downloading %s", dataset) urllib.urlretrieve(url, compressed_folder_dir) - data_url = data_links(file_name) + data_url = data_links(dataset) if data_url is not None: index = data_url.rfind("/") data_dir = os.path.join(data_folder_dir, data_url[index + 1:]) urllib.urlretrieve(data_url, data_dir) - logger.info("%s downloaded", file_name) + logger.info("%s downloaded", dataset) if not is_installed: tar = tarfile.open(compressed_folder_dir) logger.info("Extracting files from %s", data_folder_dir) tar.extractall(data_folder_dir) tar.close() - logger.info("%s installed", file_name) + logger.info("%s installed", dataset) def 
catalogue(print_list=False): - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list.json" + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list_with_filename.json" response = urlopen(url) data = response.read().decode("utf-8") data = json.loads(data) @@ -121,14 +121,14 @@ def catalogue(print_list=False): return data -def info(file_name): - data = catalogue(False) +def info(dataset): + data = catalogue() corpuses = data['gensim']['corpus'] models = data['gensim']['model'] - if file_name in corpuses: - print(data['gensim']['corpus'][file_name]) - elif file_name in models: - print(data['gensim']['model'][file_name]) + if dataset in corpuses: + print(data['gensim']['corpus'][dataset]["desc"]) + elif dataset in models: + print(data['gensim']['model'][dataset]["desc"]) else: catalogue(print_list=True) raise Exception( @@ -136,27 +136,39 @@ def info(file_name): "above.") -def load(file_name, return_path=False): - folder_dir = os.path.join(base_dir, file_name) +def get_filename(dataset): + data = catalogue() + corpuses = data['gensim']['corpus'] + models = data['gensim']['model'] + if dataset in corpuses: + return data['gensim']['corpus'][dataset]["filename"] + elif dataset in models: + return data['gensim']['model'][dataset]["filename"] + + +def load(dataset, return_path=False): + file_name = get_filename(dataset) + folder_dir = os.path.join(base_dir, dataset) + file_dir = os.path.join(folder_dir, file_name) if not os.path.exists(folder_dir): raise Exception( "Incorrect model/corpus name. Use catalogue(print_list=True) to get a list of " "avalible models/corpus. 
If the model/corpus name you entered is" " in the catalogue, then please download the model/corpus by " - "calling download('{f}') function".format(f=file_name)) + "calling download('{f}') function".format(f=dataset)) elif return_path: - return folder_dir + return file_dir else: sys.path.insert(0, base_dir) - module = __import__(file_name) + module = __import__(dataset) data = module.load_data() return data -def data_links(file_name): +def data_links(dataset): url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/links.json" response = urlopen(url) data = response.read().decode("utf-8") data = json.loads(data) - if file_name in data['data_links']: - return data['data_links'][file_name]['link'] + if dataset in data['data_links']: + return data['data_links'][dataset]['link'] From e6540703ae9d4f3557fa3080f05ee4250e2bbc7a Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Tue, 19 Sep 2017 22:09:26 +0530 Subject: [PATCH 12/15] added jupyter notebook and merged code --- docs/notebooks/API_tutorial.ipynb | 402 ++++++++++++++++++++++++++++++ gensim/api/__init__.py | 174 ------------- gensim/api/downloader.py | 19 -- gensim/downloader.py | 298 ++++++++++++++++++++++ 4 files changed, 700 insertions(+), 193 deletions(-) create mode 100644 docs/notebooks/API_tutorial.ipynb delete mode 100644 gensim/api/__init__.py delete mode 100644 gensim/api/downloader.py create mode 100644 gensim/downloader.py diff --git a/docs/notebooks/API_tutorial.ipynb b/docs/notebooks/API_tutorial.ipynb new file mode 100644 index 0000000000..2f5b6d7bbd --- /dev/null +++ b/docs/notebooks/API_tutorial.ipynb @@ -0,0 +1,402 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tutorial for using Gensim's API for downloading corpuses/models\n", + "Let's start by importing the api module." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import gensim.downloader as api" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, lets download the text8 corpus and load it." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-09-19 17:40:13,699 :gensim.api :INFO : Creating /home/chaitali/gensim-data/text8\n", + "2017-09-19 17:40:13,707 :gensim.api :INFO : Creation of /home/chaitali/gensim-data/text8 successful.\n", + "2017-09-19 17:40:13,713 :gensim.api :INFO : Downloading text8\n", + "2017-09-19 17:46:24,545 :gensim.api :INFO : text8 downloaded\n", + "2017-09-19 17:46:24,560 :gensim.api :INFO : Extracting files from /home/chaitali/gensim-data/text8\n", + "2017-09-19 17:46:26,676 :gensim.api :INFO : text8 installed\n" + ] + } + ], + "source": [ + "corpus = api.load('text8')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As the corpus has been installed, let's create a word2vec model of our corpus." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-09-19 17:46:39,030 :gensim.models.word2vec :INFO : collecting all words and their counts\n", + "2017-09-19 17:46:39,037 :gensim.models.word2vec :INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", + "2017-09-19 17:46:46,104 :gensim.models.word2vec :INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences\n", + "2017-09-19 17:46:46,105 :gensim.models.word2vec :INFO : Loading a fresh vocabulary\n", + "2017-09-19 17:46:46,393 :gensim.models.word2vec :INFO : min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)\n", + "2017-09-19 17:46:46,394 :gensim.models.word2vec :INFO : min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)\n", + "2017-09-19 17:46:46,607 :gensim.models.word2vec :INFO : deleting the raw counts dictionary of 253854 items\n", + "2017-09-19 17:46:46,618 :gensim.models.word2vec :INFO : sample=0.001 downsamples 38 most-common words\n", + "2017-09-19 17:46:46,620 :gensim.models.word2vec :INFO : downsampling leaves estimated 12506280 word corpus (74.8% of prior 16718844)\n", + "2017-09-19 17:46:46,621 :gensim.models.word2vec :INFO : estimated required memory for 71290 words and 100 dimensions: 92677000 bytes\n", + "2017-09-19 17:46:46,946 :gensim.models.word2vec :INFO : resetting layer weights\n", + "2017-09-19 17:46:48,052 :gensim.models.word2vec :INFO : training model with 3 workers on 71290 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", + "2017-09-19 17:46:49,058 :gensim.models.word2vec :INFO : PROGRESS: at 1.14% examples, 707464 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:46:50,070 :gensim.models.word2vec :INFO : PROGRESS: at 2.33% examples, 716418 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:46:51,081 :gensim.models.word2vec :INFO : 
PROGRESS: at 3.50% examples, 720064 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:46:52,084 :gensim.models.word2vec :INFO : PROGRESS: at 4.68% examples, 724069 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:46:53,087 :gensim.models.word2vec :INFO : PROGRESS: at 5.83% examples, 724165 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:46:54,090 :gensim.models.word2vec :INFO : PROGRESS: at 7.00% examples, 726206 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-19 17:46:55,094 :gensim.models.word2vec :INFO : PROGRESS: at 8.15% examples, 725286 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:46:56,096 :gensim.models.word2vec :INFO : PROGRESS: at 9.30% examples, 724925 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:46:57,114 :gensim.models.word2vec :INFO : PROGRESS: at 10.18% examples, 704674 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:46:58,130 :gensim.models.word2vec :INFO : PROGRESS: at 11.37% examples, 707549 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:46:59,136 :gensim.models.word2vec :INFO : PROGRESS: at 12.30% examples, 696048 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:00,140 :gensim.models.word2vec :INFO : PROGRESS: at 13.47% examples, 699081 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:01,150 :gensim.models.word2vec :INFO : PROGRESS: at 14.63% examples, 700492 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:02,151 :gensim.models.word2vec :INFO : PROGRESS: at 15.79% examples, 701182 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:03,156 :gensim.models.word2vec :INFO : PROGRESS: at 16.95% examples, 702555 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:47:04,161 :gensim.models.word2vec :INFO : PROGRESS: at 18.13% examples, 704501 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:05,162 :gensim.models.word2vec :INFO : PROGRESS: at 19.04% examples, 696025 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-19 17:47:06,176 :gensim.models.word2vec :INFO : PROGRESS: 
at 19.96% examples, 689054 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:07,183 :gensim.models.word2vec :INFO : PROGRESS: at 21.15% examples, 691295 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:08,185 :gensim.models.word2vec :INFO : PROGRESS: at 22.30% examples, 692243 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:09,192 :gensim.models.word2vec :INFO : PROGRESS: at 23.47% examples, 693696 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:10,211 :gensim.models.word2vec :INFO : PROGRESS: at 24.64% examples, 695112 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:11,219 :gensim.models.word2vec :INFO : PROGRESS: at 25.77% examples, 695601 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:12,237 :gensim.models.word2vec :INFO : PROGRESS: at 26.90% examples, 695914 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:13,241 :gensim.models.word2vec :INFO : PROGRESS: at 28.09% examples, 697735 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:14,260 :gensim.models.word2vec :INFO : PROGRESS: at 29.25% examples, 698509 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:15,260 :gensim.models.word2vec :INFO : PROGRESS: at 30.36% examples, 698320 words/s, in_qsize 5, out_qsize 2\n", + "2017-09-19 17:47:16,273 :gensim.models.word2vec :INFO : PROGRESS: at 31.15% examples, 690867 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:17,293 :gensim.models.word2vec :INFO : PROGRESS: at 31.72% examples, 679085 words/s, in_qsize 6, out_qsize 1\n", + "2017-09-19 17:47:18,311 :gensim.models.word2vec :INFO : PROGRESS: at 32.31% examples, 668474 words/s, in_qsize 5, out_qsize 1\n", + "2017-09-19 17:47:19,318 :gensim.models.word2vec :INFO : PROGRESS: at 32.98% examples, 660311 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:20,320 :gensim.models.word2vec :INFO : PROGRESS: at 34.05% examples, 660626 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:21,328 :gensim.models.word2vec :INFO : PROGRESS: at 
35.19% examples, 661837 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:22,350 :gensim.models.word2vec :INFO : PROGRESS: at 36.01% examples, 656869 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-19 17:47:23,358 :gensim.models.word2vec :INFO : PROGRESS: at 36.74% examples, 651028 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:24,365 :gensim.models.word2vec :INFO : PROGRESS: at 37.44% examples, 644886 words/s, in_qsize 3, out_qsize 1\n", + "2017-09-19 17:47:25,381 :gensim.models.word2vec :INFO : PROGRESS: at 38.11% examples, 638630 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:26,384 :gensim.models.word2vec :INFO : PROGRESS: at 38.73% examples, 631930 words/s, in_qsize 4, out_qsize 0\n", + "2017-09-19 17:47:27,386 :gensim.models.word2vec :INFO : PROGRESS: at 39.56% examples, 629025 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-19 17:47:28,403 :gensim.models.word2vec :INFO : PROGRESS: at 40.40% examples, 626116 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:29,412 :gensim.models.word2vec :INFO : PROGRESS: at 41.48% examples, 626962 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:47:30,422 :gensim.models.word2vec :INFO : PROGRESS: at 42.68% examples, 629634 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:31,439 :gensim.models.word2vec :INFO : PROGRESS: at 43.62% examples, 628434 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:32,451 :gensim.models.word2vec :INFO : PROGRESS: at 44.30% examples, 623710 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:33,494 :gensim.models.word2vec :INFO : PROGRESS: at 45.24% examples, 622439 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:34,500 :gensim.models.word2vec :INFO : PROGRESS: at 46.14% examples, 621141 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:35,501 :gensim.models.word2vec :INFO : PROGRESS: at 47.16% examples, 621680 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:36,502 :gensim.models.word2vec :INFO : PROGRESS: at 
47.96% examples, 619190 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:37,507 :gensim.models.word2vec :INFO : PROGRESS: at 49.01% examples, 619861 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:38,516 :gensim.models.word2vec :INFO : PROGRESS: at 49.65% examples, 615540 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:39,538 :gensim.models.word2vec :INFO : PROGRESS: at 50.32% examples, 611462 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:40,554 :gensim.models.word2vec :INFO : PROGRESS: at 51.06% examples, 608521 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:47:41,559 :gensim.models.word2vec :INFO : PROGRESS: at 52.02% examples, 608259 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:42,575 :gensim.models.word2vec :INFO : PROGRESS: at 52.82% examples, 606121 words/s, in_qsize 5, out_qsize 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-09-19 17:47:43,592 :gensim.models.word2vec :INFO : PROGRESS: at 53.52% examples, 602970 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:44,647 :gensim.models.word2vec :INFO : PROGRESS: at 54.27% examples, 600118 words/s, in_qsize 6, out_qsize 1\n", + "2017-09-19 17:47:45,650 :gensim.models.word2vec :INFO : PROGRESS: at 55.16% examples, 599088 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:47:46,652 :gensim.models.word2vec :INFO : PROGRESS: at 56.32% examples, 601102 words/s, in_qsize 4, out_qsize 0\n", + "2017-09-19 17:47:47,668 :gensim.models.word2vec :INFO : PROGRESS: at 57.50% examples, 603159 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:48,671 :gensim.models.word2vec :INFO : PROGRESS: at 58.66% examples, 605176 words/s, in_qsize 5, out_qsize 1\n", + "2017-09-19 17:47:49,677 :gensim.models.word2vec :INFO : PROGRESS: at 59.84% examples, 607140 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:50,686 :gensim.models.word2vec :INFO : PROGRESS: at 61.02% examples, 609189 words/s, in_qsize 5, out_qsize 0\n", + 
"2017-09-19 17:47:51,686 :gensim.models.word2vec :INFO : PROGRESS: at 61.96% examples, 608709 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-19 17:47:52,688 :gensim.models.word2vec :INFO : PROGRESS: at 62.94% examples, 608671 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:53,697 :gensim.models.word2vec :INFO : PROGRESS: at 63.97% examples, 609169 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:47:54,707 :gensim.models.word2vec :INFO : PROGRESS: at 65.16% examples, 611179 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:55,708 :gensim.models.word2vec :INFO : PROGRESS: at 66.31% examples, 612923 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:56,709 :gensim.models.word2vec :INFO : PROGRESS: at 67.30% examples, 613086 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:47:57,718 :gensim.models.word2vec :INFO : PROGRESS: at 68.48% examples, 614770 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:47:58,725 :gensim.models.word2vec :INFO : PROGRESS: at 69.42% examples, 614374 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:47:59,730 :gensim.models.word2vec :INFO : PROGRESS: at 70.35% examples, 613877 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:00,753 :gensim.models.word2vec :INFO : PROGRESS: at 71.09% examples, 611666 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-19 17:48:01,767 :gensim.models.word2vec :INFO : PROGRESS: at 72.06% examples, 611557 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:02,774 :gensim.models.word2vec :INFO : PROGRESS: at 72.92% examples, 610505 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:03,778 :gensim.models.word2vec :INFO : PROGRESS: at 73.80% examples, 609723 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:04,784 :gensim.models.word2vec :INFO : PROGRESS: at 74.81% examples, 610019 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:48:05,787 :gensim.models.word2vec :INFO : PROGRESS: at 75.87% examples, 610421 words/s, in_qsize 3, out_qsize 2\n", + 
"2017-09-19 17:48:06,795 :gensim.models.word2vec :INFO : PROGRESS: at 76.87% examples, 610533 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:07,808 :gensim.models.word2vec :INFO : PROGRESS: at 78.06% examples, 612109 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:08,815 :gensim.models.word2vec :INFO : PROGRESS: at 79.07% examples, 612233 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:48:09,823 :gensim.models.word2vec :INFO : PROGRESS: at 80.27% examples, 613807 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:10,833 :gensim.models.word2vec :INFO : PROGRESS: at 81.26% examples, 613696 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:11,842 :gensim.models.word2vec :INFO : PROGRESS: at 82.45% examples, 615110 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-19 17:48:12,851 :gensim.models.word2vec :INFO : PROGRESS: at 83.61% examples, 616375 words/s, in_qsize 6, out_qsize 1\n", + "2017-09-19 17:48:13,853 :gensim.models.word2vec :INFO : PROGRESS: at 84.77% examples, 617712 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:14,879 :gensim.models.word2vec :INFO : PROGRESS: at 85.67% examples, 616893 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:15,890 :gensim.models.word2vec :INFO : PROGRESS: at 86.64% examples, 616837 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:16,906 :gensim.models.word2vec :INFO : PROGRESS: at 87.69% examples, 617193 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:48:17,915 :gensim.models.word2vec :INFO : PROGRESS: at 88.79% examples, 617970 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:18,917 :gensim.models.word2vec :INFO : PROGRESS: at 89.78% examples, 618000 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-19 17:48:19,918 :gensim.models.word2vec :INFO : PROGRESS: at 90.91% examples, 618999 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:20,922 :gensim.models.word2vec :INFO : PROGRESS: at 91.96% examples, 619363 words/s, in_qsize 5, out_qsize 0\n", + 
"2017-09-19 17:48:21,936 :gensim.models.word2vec :INFO : PROGRESS: at 92.85% examples, 618640 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:22,943 :gensim.models.word2vec :INFO : PROGRESS: at 93.84% examples, 618597 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:23,958 :gensim.models.word2vec :INFO : PROGRESS: at 94.85% examples, 618687 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:24,974 :gensim.models.word2vec :INFO : PROGRESS: at 95.90% examples, 618756 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:48:25,978 :gensim.models.word2vec :INFO : PROGRESS: at 96.93% examples, 619000 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-19 17:48:26,980 :gensim.models.word2vec :INFO : PROGRESS: at 97.93% examples, 619074 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:27,982 :gensim.models.word2vec :INFO : PROGRESS: at 98.99% examples, 619431 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-19 17:48:28,823 :gensim.models.word2vec :INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2017-09-19 17:48:28,830 :gensim.models.word2vec :INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2017-09-19 17:48:28,836 :gensim.models.word2vec :INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2017-09-19 17:48:28,837 :gensim.models.word2vec :INFO : training on 85026035 raw words (62526300 effective words) took 100.8s, 620401 effective words/s\n" + ] + } + ], + "source": [ + "from gensim.models.word2vec import Word2Vec\n", + "model = Word2Vec(corpus)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have our word2vec model, let's find words that are similar to 'tree'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-09-19 17:48:45,837 :gensim.models.keyedvectors :INFO : precomputing L2-norms of word weight vectors\n" + ] + }, + { + 
"data": { + "text/plain": [ + "[('leaf', 0.7284336090087891),\n", + " ('trees', 0.7024068236351013),\n", + " ('bark', 0.6984879970550537),\n", + " ('fruit', 0.623538613319397),\n", + " ('flower', 0.6177238821983337),\n", + " ('nest', 0.6133654713630676),\n", + " ('garden', 0.5962027311325073),\n", + " ('avl', 0.5909914374351501),\n", + " ('cave', 0.5902420282363892),\n", + " ('pond', 0.5827507972717285)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.most_similar('tree')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can use the API to download many corpuses and models. You can get the list of all the models and corpuses that are provided, by using the code below:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"gensim\": {\n", + " \"model\": {\n", + " \"Google_News_word2vec\": {\n", + " \"desc\": \"Google has published pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases.\",\n", + " \"filename\": \"GoogleNews-vectors-negative300.bin.gz\",\n", + " \"checksum\": \"4fa963d128fe65ec8cd5dd4d9377f8ed\"\n", + " },\n", + " \"fasttext_eng_model\": {\n", + " \"desc\": \"fastText is a library for efficient learning of word representations and sentence classification.These vectors for english language in dimension 300 were obtained using the skip-gram model described in Bojanowski et al. (2016) with default parameters.\",\n", + " \"filename\": \"wiki.en.vec\",\n", + " \"checksum\": \"2de532213d7fa8b937263337c6e9deeb\"\n", + " },\n", + " \"glove_common_crawl_42B\": {\n", + " \"desc\": \"This model is trained on Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors). 
GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.42B.300d.zip\",\n", + " \"checksum\": \"d6f41a6e9e5bf905d349a01b5216826a\"\n", + " },\n", + " \"glove_common_crawl_840B\": {\n", + " \"desc\": \"This model is trained on Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.840B.300d.zip\",\n", + " \"checksum\": \"72f02c239743c750eaea8747839e4852\"\n", + " },\n", + " \"glove_wiki_gigaword_300d\": {\n", + " \"desc\": \" This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 300d vectors).GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.6B.300d.txt\",\n", + " \"checksum\": \"e0c1af43ab57753d11da2fa642c3ff82\"\n", + " },\n", + " \"glove_wiki_gigaword_200d\": {\n", + " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 200d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. 
Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.6B.200d.txt\",\n", + " \"checksum\": \"c4e58068e16be476b115699f94fa82cb\"\n", + " },\n", + " \"glove_wiki_gigaword_100d\": {\n", + " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 100d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.6B.100d.txt\",\n", + " \"checksum\": \"7067a76b2adc0e92a1f71e2919382c95\"\n", + " },\n", + " \"glove_wiki_gigaword_50d\": {\n", + " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 50d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.6B.50d.txt\",\n", + " \"checksum\": \"44d71eb1db9485d9c8a605a5ed560d8c\"\n", + " },\n", + " \"glove_twitter_200d\": {\n", + " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 200d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. 
Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.twitter.27B.200d.txt\",\n", + " \"checksum\": \"91b40581d04e2ff5306d2f0452e34f72\"\n", + " },\n", + " \"glove_twitter_100d\": {\n", + " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 100d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.twitter.27B.100d.txt\",\n", + " \"checksum\": \"2825c182e4ac2afd8d2dede8445919ab\"\n", + " },\n", + " \"glove_twitter_50d\": {\n", + " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 50d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.twitter.27B.50d.txt\",\n", + " \"checksum\": \"9842275a894ebdfb60b270877bb8f60c\"\n", + " },\n", + " \"glove_twitter_25d\": {\n", + " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 25d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. 
Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.twitter.27B.25d.txt\",\n", + " \"checksum\": \"9802ffec313d8612bf790d1aa4d37ddd\"\n", + " }\n", + " },\n", + " \"corpus\": {\n", + " \"text8\": {\n", + " \"desc\": \"Wikipedia English corpus\",\n", + " \"filename\": \"text8\",\n", + " \"checksum\": \"5d703f1842fb1ca55bf86f2e2552012c\"\n", + " }\n", + " }\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "import json\n", + "dataset_list = api.info()\n", + "print(json.dumps(dataset_list, indent=4))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to get detailed information about the model/corpus, use:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-09-19 21:56:42,071 :gensim.api :INFO : This model is trained on Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space. \n", + "\n" + ] + } + ], + "source": [ + "api.info('glove_common_crawl_42B')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sometimes, you do not want to load the corpus/model to memory. You would just want to get the path to the corpus/model. 
For that, use :" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "text8_path = api.load('text8', return_path=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/gensim/api/__init__.py b/gensim/api/__init__.py deleted file mode 100644 index a1ad63ebb4..0000000000 --- a/gensim/api/__init__.py +++ /dev/null @@ -1,174 +0,0 @@ -from __future__ import print_function -from __future__ import absolute_import -import json -import os -import tarfile -import logging -import sys -import errno -try: - import urllib.request as urllib -except ImportError: - import urllib - -try: - from urllib.request import urlopen -except ImportError: - from urllib2 import urlopen - - -user_dir = os.path.expanduser('~') -base_dir = os.path.join(user_dir, 'gensim-data') -log_file_dir = os.path.join(base_dir, 'api.log') -if not os.path.isdir(base_dir): - try: - os.makedirs(base_dir) - except OSError as e: - if e.errno == errno.EEXIST: - raise Exception( - "Not able to create folder gensim-data in {}. File gensim-data " - "exists in the direcory already.".format(user_dir)) - else: - raise Exception( - "Can't create {}. 
Make sure you have the read/write permissions " - "to the directory or you can try creating the folder manually" - .format(base_dir)) - -logging.basicConfig( - format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', - filename=log_file_dir, level=logging.INFO) -console = logging.StreamHandler() -console.setLevel(logging.INFO) -logger = logging.getLogger('gensim.api') -logger.addHandler(console) - - -def download(dataset): - url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/{f}/{f}.tar.gz".format(f=dataset) - data_folder_dir = os.path.join(base_dir, dataset) - data = catalogue(print_list=False) - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if dataset not in corpuses and dataset not in models: - logger.error( - "Incorect Model/corpus name. Use catalogue(print_list=TRUE) or" - " python -m gensim -c to get a list of models/corpuses" - " available.") - sys.exit(0) - compressed_folder_name = "{f}.tar.gz".format(f=dataset) - compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) - is_installed = False - is_downloaded = False - installed_message = "{f} installed".format(f=dataset) - downloaded_message = "{f} downloaded".format(f=dataset) - if os.path.exists(data_folder_dir): - log_file_dir = os.path.join(base_dir, 'api.log') - with open(log_file_dir) as f: - f = f.readlines() - for line in f: - if installed_message in line: - print("{} has already been installed".format(dataset)) - is_installed = True - sys.exit(0) - if os.path.exists(data_folder_dir) and not is_installed: - for line in f: - if downloaded_message in line: - is_downloaded = True - break - if not is_downloaded: - if not os.path.exists(data_folder_dir): - logger.info("Creating %s", data_folder_dir) - os.makedirs(data_folder_dir) - if os.path.exists(data_folder_dir): - logger.info("Creation of %s successful.", data_folder_dir) - else: - logger.error( - "Not able to create %s. 
Make sure you have the correct read/" - "write permissions for %s or you can try creating it manually", - data_folder_dir, base_dir) - sys.exit(0) - logger.info("Downloading %s", dataset) - urllib.urlretrieve(url, compressed_folder_dir) - data_url = data_links(dataset) - if data_url is not None: - index = data_url.rfind("/") - data_dir = os.path.join(data_folder_dir, data_url[index + 1:]) - urllib.urlretrieve(data_url, data_dir) - logger.info("%s downloaded", dataset) - if not is_installed: - tar = tarfile.open(compressed_folder_dir) - logger.info("Extracting files from %s", data_folder_dir) - tar.extractall(data_folder_dir) - tar.close() - logger.info("%s installed", dataset) - - -def catalogue(print_list=False): - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list_with_filename.json" - response = urlopen(url) - data = response.read().decode("utf-8") - data = json.loads(data) - if print_list: - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - print("Corpuses available : ") - for corpus in corpuses: - print(corpus) - print("") - print("Models available : ") - for model in models: - print(model) - return data - - -def info(dataset): - data = catalogue() - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if dataset in corpuses: - print(data['gensim']['corpus'][dataset]["desc"]) - elif dataset in models: - print(data['gensim']['model'][dataset]["desc"]) - else: - catalogue(print_list=True) - raise Exception( - "Incorrect model/corpus name. 
Choose the model/corpus from the list " - "above.") - - -def get_filename(dataset): - data = catalogue() - corpuses = data['gensim']['corpus'] - models = data['gensim']['model'] - if dataset in corpuses: - return data['gensim']['corpus'][dataset]["filename"] - elif dataset in models: - return data['gensim']['model'][dataset]["filename"] - - -def load(dataset, return_path=False): - file_name = get_filename(dataset) - folder_dir = os.path.join(base_dir, dataset) - file_dir = os.path.join(folder_dir, file_name) - if not os.path.exists(folder_dir): - raise Exception( - "Incorrect model/corpus name. Use catalogue(print_list=True) to get a list of " - "avalible models/corpus. If the model/corpus name you entered is" - " in the catalogue, then please download the model/corpus by " - "calling download('{f}') function".format(f=dataset)) - elif return_path: - return file_dir - else: - sys.path.insert(0, base_dir) - module = __import__(dataset) - data = module.load_data() - return data - - -def data_links(dataset): - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/links.json" - response = urlopen(url) - data = response.read().decode("utf-8") - data = json.loads(data) - if dataset in data['data_links']: - return data['data_links'][dataset]['link'] diff --git a/gensim/api/downloader.py b/gensim/api/downloader.py deleted file mode 100644 index 0accff9f8c..0000000000 --- a/gensim/api/downloader.py +++ /dev/null @@ -1,19 +0,0 @@ -from __future__ import print_function -from __future__ import absolute_import -import argparse -from gensim.api import download -from gensim.api import catalogue -from gensim.api import info -if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Gensim console API", usage="python -m gensim.api.downloader [-h] [-d dataset_name | -i dataset_name | -c]") - group = parser.add_mutually_exclusive_group() - group.add_argument("-d", "--download", metavar="dataset_name", nargs=1, help="To download a 
corpus/model : python -m gensim -d corpus/model name") - group.add_argument("-i", "--info", metavar="dataset_name", nargs=1, help="To get information about a corpus/model : python -m gensim -i model/corpus name") - group.add_argument("-c", "--catalogue", help="To get the list of all models/corpus stored : python -m gensim -c", action="store_true") - args = parser.parse_args() - if args.download is not None: - download(args.download[0]) - elif args.info is not None: - info(args.info[0]) - elif args.catalogue is not None: - catalogue(print_list=True) diff --git a/gensim/downloader.py b/gensim/downloader.py new file mode 100644 index 0000000000..8edd3062b6 --- /dev/null +++ b/gensim/downloader.py @@ -0,0 +1,298 @@ +from __future__ import absolute_import +import argparse +import json +import os +import tarfile +import logging +import sys +import errno +import hashlib +try: + import urllib.request as urllib +except ImportError: + import urllib + +try: + from urllib.request import urlopen +except ImportError: + from urllib2 import urlopen + +user_dir = os.path.expanduser('~') +base_dir = os.path.join(user_dir, 'gensim-data') +data_log_file_dir = os.path.join(base_dir, 'data.json') + +logging.basicConfig( + format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', + stream=sys.stdout, level=logging.INFO) +logger = logging.getLogger('gensim.api') + +if not os.path.isdir(base_dir): + try: + logger.info("Creating %s", base_dir) + os.makedirs(base_dir) + except OSError as e: + if e.errno == errno.EEXIST: + raise Exception( + "Not able to create folder gensim-data in {}. File gensim-data " + "exists in the direcory already.".format(user_dir)) + else: + raise Exception( + "Can't create {}. Make sure you have the read/write permissions " + "to the directory or you can try creating the folder manually" + .format(base_dir)) + + +def initialize_data_log_file(): + """Function for initializing the log file. Creates a json object + for each corpus/model and stores in the log file. 
For eg: {"name": "text8", "status" : "None"} + """ + data = info() + corpora = data['gensim']['corpus'] + models = data['gensim']['model'] + json_list = [] + for corpus in corpora: + json_object = {"name": corpus, "status": "None"} + json_list.append(json_object) + for model in models: + json_object = {"name": model, "status": "None"} + json_list.append(json_object) + json.dump(json_list, data_log_file) + data_log_file.close() + + +def update_data_log_file(dataset, status): + """Function for updating the status of the dataset json object. + + Args: + dataset(string): Name of the corpus/model. + status(string): Status to be updates to i.e downloaded or installed. + """ + jdata = json.loads(open(data_log_file_dir).read()) + for json_object in jdata: + if json_object["name"] == dataset: + json_object["status"] = status + with open(data_log_file_dir, 'w') as f: + f.write(json.dumps(jdata)) + + +def get_data_status(dataset): + """Function for finding the status of the dataset. + + Args: + dataset(string): Name of the corpus/model. + + Returns: + string: returns the current status of the corpus/model i.e None, downloaded or installed. + """ + jdata = json.loads(open(data_log_file_dir).read()) + for json_object in jdata: + if json_object["name"] == dataset: + return json_object["status"] + + +def calculate_md5_checksum(folder_dir): + """Function for calculating checksum of a downloaded model/corpus. + + Args: + folder_dir(string): Path to the downloaded model. + + Returns: + string: It returns the value for the checksum for folder_dir directory + """ + hash_md5 = hashlib.md5() + for filename in os.listdir(folder_dir): + file_dir = os.path.join(folder_dir, filename) + with open(file_dir, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() + + +def info(dataset=None): + """Function for retrieving the list of corpora/models, if dataset is not provided. 
If dataset + is provided, then it gives detailed information about the dataset. + + Args: + dataset(string): Name of the corpus/model. + + Returns: + : It returns the models/corpora names with detailed information about each. + """ + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list_with_filename.json" + response = urlopen(url) + data = response.read().decode("utf-8") + data = json.loads(data) + if dataset is not None: + corpora = data['gensim']['corpus'] + models = data['gensim']['model'] + if dataset in corpora: + logger.info("%s \n", data['gensim']['corpus'][dataset]["desc"]) + elif dataset in models: + logger.info("%s \n", data['gensim']['model'][dataset]["desc"]) + else: + raise Exception( + "Incorrect model/corpus name. Choose the model/corpus from the list " + "\n {}".format(json.dumps(data, indent=4))) + else: + return data + + +def get_checksum(dataset): + """Function for retrieving the checksum of a corpus/model + + Args: + dataset(string): Name of the corpus/model. + + Returns: + string: It returns the checksum for corresponding the corpus/model. + """ + data = info() + corpora = data['gensim']['corpus'] + models = data['gensim']['model'] + if dataset in corpora: + return data['gensim']['corpus'][dataset]["checksum"] + elif dataset in models: + return data['gensim']['model'][dataset]["checksum"] + + +if not os.path.isfile(data_log_file_dir): + try: + logger.warning("Creating %s", data_log_file_dir) + data_log_file = open(data_log_file_dir, 'a') + initialize_data_log_file() + except: + raise Exception( + "Can't create {}. Make sure you have the read/write permissions " + "to the directory or you can try creating the file manually" + .format(data_log_file_dir)) + + +def _download(dataset): + """Function for downloading and installed dataset depending upon it's current status. + + Args: + dataset(string): Name of the corpus/model. 
+ """ + url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/{f}/{f}.tar.gz".format(f=dataset) + data_folder_dir = os.path.join(base_dir, dataset) + data = info() + corpora = data['gensim']['corpus'] + models = data['gensim']['model'] + if dataset not in corpora and dataset not in models: + raise Exception( + "Incorect Model/corpus name. Use info() or" + " python -m gensim.downloader -c to get a list of models/corpora" + " available.") + compressed_folder_name = "{f}.tar.gz".format(f=dataset) + compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) + if get_data_status(dataset) != "downloaded": + if not os.path.exists(data_folder_dir): + logger.info("Creating %s", data_folder_dir) + os.makedirs(data_folder_dir) + if os.path.exists(data_folder_dir): + logger.info("Creation of %s successful.", data_folder_dir) + else: + raise Exception( + "Not able to create {a}. Make sure you have the correct read/" + "write permissions for {b} or you can try creating it manually". + format(a=data_folder_dir, b=base_dir)) + logger.info("Downloading %s", dataset) + urllib.urlretrieve(url, compressed_folder_dir) + data_url = data_links(dataset) + if data_url is not None: + index = data_url.rfind("/") + data_dir = os.path.join(data_folder_dir, data_url[index + 1:]) + urllib.urlretrieve(data_url, data_dir) + logger.info("%s downloaded", dataset) + update_data_log_file(dataset, status="downloaded") + if get_data_status(dataset) != "installed": + tar = tarfile.open(compressed_folder_dir) + logger.info("Extracting files from %s", data_folder_dir) + tar.extractall(data_folder_dir) + tar.close() + if calculate_md5_checksum(data_folder_dir) == get_checksum(dataset): + update_data_log_file(dataset, status="installed") + logger.info("%s installed", dataset) + else: + logger.error("There was a problem in installing the file. Retrying.") + _download(dataset) + + +def get_filename(dataset): + """Function of retrieving the filename of corpus/model. 
+ + Args: + dataset(string): Name of the corpus/model. + + Returns: + string: Returns the filename of the model/corpus. + """ + data = info() + corpora = data['gensim']['corpus'] + models = data['gensim']['model'] + if dataset in corpora: + return data['gensim']['corpus'][dataset]["filename"] + elif dataset in models: + return data['gensim']['model'][dataset]["filename"] + + +def load(dataset, return_path=False): + """Loads the corpus/model to the memory, if return_path is False. + + Args: + dataset(string): Name of the corpus/model. + return_path(bool): Determines whether to return model/corpus file path. + + Returns: + string: Returns the path to the model/corpus, if return_path is True. + """ + file_name = get_filename(dataset) + if file_name is None: + raise Exception( + "Incorrect model/corpus name. Choose the model/corpus from the list " + "\n {}".format(json.dumps(info(), indent=4))) + folder_dir = os.path.join(base_dir, dataset) + file_dir = os.path.join(folder_dir, file_name) + if not os.path.exists(folder_dir) or get_data_status(dataset) != "installed": + _download(dataset) + if return_path: + return file_dir + else: + sys.path.insert(0, base_dir) + module = __import__(dataset) + data = module.load_data() + return data + + +def data_links(dataset): + """Function for retrieving the links of the models/corpus which are not stored in github releases + + Args: + dataset(string): Name of the corpus/model. + + Returns: + string: Returns the link of the model/corpus. 
+ """ + url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/links.json" + response = urlopen(url) + data = response.read().decode("utf-8") + data = json.loads(data) + if dataset in data['data_links']: + return data['data_links'][dataset]['link'] + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Gensim console API", usage="python -m gensim.api.downloader [-h] [-d dataset_name | -i dataset_name | -c]") + group = parser.add_mutually_exclusive_group() + group.add_argument("-d", "--download", metavar="dataset_name", nargs=1, help="To download a corpus/model : python -m gensim -d corpus/model name") + group.add_argument("-i", "--info", metavar="dataset_name", nargs=1, help="To get information about a corpus/model : python -m gensim -i model/corpus name") + group.add_argument("-c", "--catalogue", help="To get the list of all models/corpus stored : python -m gensim -c", action="store_true") + args = parser.parse_args() + if args.download is not None: + data_path = load(args.download[0], return_path=True) + logger.info("Data has been installed and data path is %s", data_path) + elif args.info is not None: + info(dataset=args.info[0]) + elif args.catalogue is not None: + data = info() + logger.info("%s\n", json.dumps(data, indent=4)) From b0d1110fb4f51b06821285f57a168212a1b7b70b Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Tue, 3 Oct 2017 18:27:17 +0530 Subject: [PATCH 13/15] alternate names for load --- docs/notebooks/API_tutorial.ipynb | 424 +++++++++++++++--------------- gensim/downloader.py | 272 +++++++++++-------- 2 files changed, 368 insertions(+), 328 deletions(-) diff --git a/docs/notebooks/API_tutorial.ipynb b/docs/notebooks/API_tutorial.ipynb index 2f5b6d7bbd..5b9d28c59b 100644 --- a/docs/notebooks/API_tutorial.ipynb +++ b/docs/notebooks/API_tutorial.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "collapsed": true }, @@ -28,23 +28,25 @@ 
}, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "2017-09-19 17:40:13,699 :gensim.api :INFO : Creating /home/chaitali/gensim-data/text8\n", - "2017-09-19 17:40:13,707 :gensim.api :INFO : Creation of /home/chaitali/gensim-data/text8 successful.\n", - "2017-09-19 17:40:13,713 :gensim.api :INFO : Downloading text8\n", - "2017-09-19 17:46:24,545 :gensim.api :INFO : text8 downloaded\n", - "2017-09-19 17:46:24,560 :gensim.api :INFO : Extracting files from /home/chaitali/gensim-data/text8\n", - "2017-09-19 17:46:26,676 :gensim.api :INFO : text8 installed\n" + "2017-09-30 20:00:53,429 : INFO : Creating /home/chaitali/gensim-data/text8\n", + "2017-09-30 20:00:53,431 : INFO : Creation of /home/chaitali/gensim-data/text8 successful.\n", + "2017-09-30 20:00:53,433 : INFO : Downloading text8\n", + "2017-09-30 20:04:46,938 : INFO : text8 downloaded\n", + "2017-09-30 20:04:46,951 : INFO : Extracting files from /home/chaitali/gensim-data/text8\n", + "2017-09-30 20:04:48,888 : INFO : text8 installed\n" ] } ], "source": [ + "import logging\n", + "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", "corpus = api.load('text8')" ] }, @@ -57,134 +59,124 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "2017-09-19 17:46:39,030 :gensim.models.word2vec :INFO : collecting all words and their counts\n", - "2017-09-19 17:46:39,037 :gensim.models.word2vec :INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", - "2017-09-19 17:46:46,104 :gensim.models.word2vec :INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences\n", - "2017-09-19 17:46:46,105 :gensim.models.word2vec :INFO : Loading a fresh vocabulary\n", - "2017-09-19 
17:46:46,393 :gensim.models.word2vec :INFO : min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)\n", - "2017-09-19 17:46:46,394 :gensim.models.word2vec :INFO : min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)\n", - "2017-09-19 17:46:46,607 :gensim.models.word2vec :INFO : deleting the raw counts dictionary of 253854 items\n", - "2017-09-19 17:46:46,618 :gensim.models.word2vec :INFO : sample=0.001 downsamples 38 most-common words\n", - "2017-09-19 17:46:46,620 :gensim.models.word2vec :INFO : downsampling leaves estimated 12506280 word corpus (74.8% of prior 16718844)\n", - "2017-09-19 17:46:46,621 :gensim.models.word2vec :INFO : estimated required memory for 71290 words and 100 dimensions: 92677000 bytes\n", - "2017-09-19 17:46:46,946 :gensim.models.word2vec :INFO : resetting layer weights\n", - "2017-09-19 17:46:48,052 :gensim.models.word2vec :INFO : training model with 3 workers on 71290 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", - "2017-09-19 17:46:49,058 :gensim.models.word2vec :INFO : PROGRESS: at 1.14% examples, 707464 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:46:50,070 :gensim.models.word2vec :INFO : PROGRESS: at 2.33% examples, 716418 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:46:51,081 :gensim.models.word2vec :INFO : PROGRESS: at 3.50% examples, 720064 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:46:52,084 :gensim.models.word2vec :INFO : PROGRESS: at 4.68% examples, 724069 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:46:53,087 :gensim.models.word2vec :INFO : PROGRESS: at 5.83% examples, 724165 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:46:54,090 :gensim.models.word2vec :INFO : PROGRESS: at 7.00% examples, 726206 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-19 17:46:55,094 :gensim.models.word2vec :INFO : PROGRESS: at 8.15% examples, 725286 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:46:56,096 
:gensim.models.word2vec :INFO : PROGRESS: at 9.30% examples, 724925 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:46:57,114 :gensim.models.word2vec :INFO : PROGRESS: at 10.18% examples, 704674 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:46:58,130 :gensim.models.word2vec :INFO : PROGRESS: at 11.37% examples, 707549 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:46:59,136 :gensim.models.word2vec :INFO : PROGRESS: at 12.30% examples, 696048 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:00,140 :gensim.models.word2vec :INFO : PROGRESS: at 13.47% examples, 699081 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:01,150 :gensim.models.word2vec :INFO : PROGRESS: at 14.63% examples, 700492 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:02,151 :gensim.models.word2vec :INFO : PROGRESS: at 15.79% examples, 701182 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:03,156 :gensim.models.word2vec :INFO : PROGRESS: at 16.95% examples, 702555 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:47:04,161 :gensim.models.word2vec :INFO : PROGRESS: at 18.13% examples, 704501 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:05,162 :gensim.models.word2vec :INFO : PROGRESS: at 19.04% examples, 696025 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-19 17:47:06,176 :gensim.models.word2vec :INFO : PROGRESS: at 19.96% examples, 689054 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:07,183 :gensim.models.word2vec :INFO : PROGRESS: at 21.15% examples, 691295 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:08,185 :gensim.models.word2vec :INFO : PROGRESS: at 22.30% examples, 692243 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:09,192 :gensim.models.word2vec :INFO : PROGRESS: at 23.47% examples, 693696 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:10,211 :gensim.models.word2vec :INFO : PROGRESS: at 24.64% examples, 695112 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:11,219 
:gensim.models.word2vec :INFO : PROGRESS: at 25.77% examples, 695601 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:12,237 :gensim.models.word2vec :INFO : PROGRESS: at 26.90% examples, 695914 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:13,241 :gensim.models.word2vec :INFO : PROGRESS: at 28.09% examples, 697735 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:14,260 :gensim.models.word2vec :INFO : PROGRESS: at 29.25% examples, 698509 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:15,260 :gensim.models.word2vec :INFO : PROGRESS: at 30.36% examples, 698320 words/s, in_qsize 5, out_qsize 2\n", - "2017-09-19 17:47:16,273 :gensim.models.word2vec :INFO : PROGRESS: at 31.15% examples, 690867 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:17,293 :gensim.models.word2vec :INFO : PROGRESS: at 31.72% examples, 679085 words/s, in_qsize 6, out_qsize 1\n", - "2017-09-19 17:47:18,311 :gensim.models.word2vec :INFO : PROGRESS: at 32.31% examples, 668474 words/s, in_qsize 5, out_qsize 1\n", - "2017-09-19 17:47:19,318 :gensim.models.word2vec :INFO : PROGRESS: at 32.98% examples, 660311 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:20,320 :gensim.models.word2vec :INFO : PROGRESS: at 34.05% examples, 660626 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:21,328 :gensim.models.word2vec :INFO : PROGRESS: at 35.19% examples, 661837 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:22,350 :gensim.models.word2vec :INFO : PROGRESS: at 36.01% examples, 656869 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-19 17:47:23,358 :gensim.models.word2vec :INFO : PROGRESS: at 36.74% examples, 651028 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:24,365 :gensim.models.word2vec :INFO : PROGRESS: at 37.44% examples, 644886 words/s, in_qsize 3, out_qsize 1\n", - "2017-09-19 17:47:25,381 :gensim.models.word2vec :INFO : PROGRESS: at 38.11% examples, 638630 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:26,384 
:gensim.models.word2vec :INFO : PROGRESS: at 38.73% examples, 631930 words/s, in_qsize 4, out_qsize 0\n", - "2017-09-19 17:47:27,386 :gensim.models.word2vec :INFO : PROGRESS: at 39.56% examples, 629025 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-19 17:47:28,403 :gensim.models.word2vec :INFO : PROGRESS: at 40.40% examples, 626116 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:29,412 :gensim.models.word2vec :INFO : PROGRESS: at 41.48% examples, 626962 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:47:30,422 :gensim.models.word2vec :INFO : PROGRESS: at 42.68% examples, 629634 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:31,439 :gensim.models.word2vec :INFO : PROGRESS: at 43.62% examples, 628434 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:32,451 :gensim.models.word2vec :INFO : PROGRESS: at 44.30% examples, 623710 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:33,494 :gensim.models.word2vec :INFO : PROGRESS: at 45.24% examples, 622439 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:34,500 :gensim.models.word2vec :INFO : PROGRESS: at 46.14% examples, 621141 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:35,501 :gensim.models.word2vec :INFO : PROGRESS: at 47.16% examples, 621680 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:36,502 :gensim.models.word2vec :INFO : PROGRESS: at 47.96% examples, 619190 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:37,507 :gensim.models.word2vec :INFO : PROGRESS: at 49.01% examples, 619861 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:38,516 :gensim.models.word2vec :INFO : PROGRESS: at 49.65% examples, 615540 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:39,538 :gensim.models.word2vec :INFO : PROGRESS: at 50.32% examples, 611462 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:40,554 :gensim.models.word2vec :INFO : PROGRESS: at 51.06% examples, 608521 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:47:41,559 
:gensim.models.word2vec :INFO : PROGRESS: at 52.02% examples, 608259 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:42,575 :gensim.models.word2vec :INFO : PROGRESS: at 52.82% examples, 606121 words/s, in_qsize 5, out_qsize 1\n" + "2017-09-30 20:04:59,672 : INFO : collecting all words and their counts\n", + "2017-09-30 20:04:59,677 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", + "2017-09-30 20:05:06,425 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences\n", + "2017-09-30 20:05:06,426 : INFO : Loading a fresh vocabulary\n", + "2017-09-30 20:05:06,711 : INFO : min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)\n", + "2017-09-30 20:05:06,711 : INFO : min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)\n", + "2017-09-30 20:05:06,925 : INFO : deleting the raw counts dictionary of 253854 items\n", + "2017-09-30 20:05:06,935 : INFO : sample=0.001 downsamples 38 most-common words\n", + "2017-09-30 20:05:06,936 : INFO : downsampling leaves estimated 12506280 word corpus (74.8% of prior 16718844)\n", + "2017-09-30 20:05:06,938 : INFO : estimated required memory for 71290 words and 100 dimensions: 92677000 bytes\n", + "2017-09-30 20:05:07,267 : INFO : resetting layer weights\n", + "2017-09-30 20:05:08,240 : INFO : training model with 3 workers on 71290 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", + "2017-09-30 20:05:09,250 : INFO : PROGRESS: at 1.09% examples, 677048 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:05:10,260 : INFO : PROGRESS: at 2.22% examples, 682662 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:05:11,266 : INFO : PROGRESS: at 3.35% examples, 688976 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:12,281 : INFO : PROGRESS: at 4.47% examples, 688776 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:13,283 : INFO : PROGRESS: at 5.58% examples, 692300 words/s, 
in_qsize 4, out_qsize 1\n", + "2017-09-30 20:05:14,291 : INFO : PROGRESS: at 6.71% examples, 695154 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:15,298 : INFO : PROGRESS: at 7.83% examples, 695477 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:05:16,302 : INFO : PROGRESS: at 8.94% examples, 694913 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:05:17,320 : INFO : PROGRESS: at 10.04% examples, 693561 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:05:18,334 : INFO : PROGRESS: at 11.17% examples, 694297 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:19,349 : INFO : PROGRESS: at 12.28% examples, 693175 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:20,357 : INFO : PROGRESS: at 13.39% examples, 693211 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:05:21,369 : INFO : PROGRESS: at 14.52% examples, 693794 words/s, in_qsize 4, out_qsize 0\n", + "2017-09-30 20:05:22,377 : INFO : PROGRESS: at 15.57% examples, 689525 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:23,377 : INFO : PROGRESS: at 16.68% examples, 689973 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:24,391 : INFO : PROGRESS: at 17.72% examples, 686717 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:25,398 : INFO : PROGRESS: at 18.86% examples, 687798 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:05:26,409 : INFO : PROGRESS: at 20.00% examples, 688486 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:27,411 : INFO : PROGRESS: at 21.15% examples, 689967 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:28,424 : INFO : PROGRESS: at 22.27% examples, 689469 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:05:29,432 : INFO : PROGRESS: at 23.41% examples, 690341 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:30,440 : INFO : PROGRESS: at 24.53% examples, 690470 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:31,454 : INFO : PROGRESS: at 25.64% examples, 690757 words/s, in_qsize 5, out_qsize 
0\n", + "2017-09-30 20:05:32,460 : INFO : PROGRESS: at 26.76% examples, 691272 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:33,468 : INFO : PROGRESS: at 27.89% examples, 691773 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:34,481 : INFO : PROGRESS: at 29.02% examples, 692058 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:35,491 : INFO : PROGRESS: at 30.14% examples, 692208 words/s, in_qsize 4, out_qsize 0\n", + "2017-09-30 20:05:36,501 : INFO : PROGRESS: at 31.23% examples, 691812 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:37,507 : INFO : PROGRESS: at 32.35% examples, 691913 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:38,527 : INFO : PROGRESS: at 33.47% examples, 691941 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:05:39,534 : INFO : PROGRESS: at 34.59% examples, 692104 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:40,535 : INFO : PROGRESS: at 35.73% examples, 692232 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:41,542 : INFO : PROGRESS: at 36.83% examples, 691803 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:42,549 : INFO : PROGRESS: at 37.95% examples, 692107 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:43,561 : INFO : PROGRESS: at 39.08% examples, 692095 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:44,572 : INFO : PROGRESS: at 40.21% examples, 692168 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:45,574 : INFO : PROGRESS: at 41.35% examples, 692460 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:46,586 : INFO : PROGRESS: at 42.46% examples, 692126 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:47,597 : INFO : PROGRESS: at 43.61% examples, 692641 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:48,599 : INFO : PROGRESS: at 44.71% examples, 692672 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:49,609 : INFO : PROGRESS: at 45.81% examples, 692414 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 
20:05:50,613 : INFO : PROGRESS: at 46.89% examples, 692196 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:05:51,614 : INFO : PROGRESS: at 47.98% examples, 692082 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:05:52,615 : INFO : PROGRESS: at 49.08% examples, 691919 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:05:53,622 : INFO : PROGRESS: at 50.18% examples, 691886 words/s, in_qsize 5, out_qsize 2\n", + "2017-09-30 20:05:54,631 : INFO : PROGRESS: at 51.30% examples, 691985 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:55,631 : INFO : PROGRESS: at 52.39% examples, 691836 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:56,644 : INFO : PROGRESS: at 53.50% examples, 691643 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:57,650 : INFO : PROGRESS: at 54.61% examples, 691789 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:58,651 : INFO : PROGRESS: at 55.72% examples, 691433 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:05:59,651 : INFO : PROGRESS: at 56.81% examples, 691258 words/s, in_qsize 2, out_qsize 1\n", + "2017-09-30 20:06:00,662 : INFO : PROGRESS: at 57.93% examples, 691247 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:01,666 : INFO : PROGRESS: at 59.05% examples, 691236 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:06:02,674 : INFO : PROGRESS: at 60.16% examples, 691214 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:03,692 : INFO : PROGRESS: at 61.31% examples, 691243 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:06:04,693 : INFO : PROGRESS: at 62.41% examples, 691156 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:05,706 : INFO : PROGRESS: at 63.54% examples, 691252 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:06,726 : INFO : PROGRESS: at 64.66% examples, 691169 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:06:07,742 : INFO : PROGRESS: at 65.77% examples, 691212 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:08,747 : INFO 
: PROGRESS: at 66.88% examples, 691305 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:09,752 : INFO : PROGRESS: at 67.98% examples, 691292 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:10,761 : INFO : PROGRESS: at 69.09% examples, 691231 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:06:11,767 : INFO : PROGRESS: at 70.21% examples, 691345 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:06:12,776 : INFO : PROGRESS: at 71.30% examples, 691192 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:06:13,779 : INFO : PROGRESS: at 72.39% examples, 691076 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:06:14,783 : INFO : PROGRESS: at 73.50% examples, 691043 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:15,799 : INFO : PROGRESS: at 74.60% examples, 690947 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:06:16,804 : INFO : PROGRESS: at 75.72% examples, 690751 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:17,823 : INFO : PROGRESS: at 76.83% examples, 690552 words/s, in_qsize 5, out_qsize 0\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "2017-09-19 17:47:43,592 :gensim.models.word2vec :INFO : PROGRESS: at 53.52% examples, 602970 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:44,647 :gensim.models.word2vec :INFO : PROGRESS: at 54.27% examples, 600118 words/s, in_qsize 6, out_qsize 1\n", - "2017-09-19 17:47:45,650 :gensim.models.word2vec :INFO : PROGRESS: at 55.16% examples, 599088 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:47:46,652 :gensim.models.word2vec :INFO : PROGRESS: at 56.32% examples, 601102 words/s, in_qsize 4, out_qsize 0\n", - "2017-09-19 17:47:47,668 :gensim.models.word2vec :INFO : PROGRESS: at 57.50% examples, 603159 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:48,671 :gensim.models.word2vec :INFO : PROGRESS: at 58.66% examples, 605176 words/s, in_qsize 5, out_qsize 1\n", - "2017-09-19 17:47:49,677 :gensim.models.word2vec 
:INFO : PROGRESS: at 59.84% examples, 607140 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:50,686 :gensim.models.word2vec :INFO : PROGRESS: at 61.02% examples, 609189 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:51,686 :gensim.models.word2vec :INFO : PROGRESS: at 61.96% examples, 608709 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-19 17:47:52,688 :gensim.models.word2vec :INFO : PROGRESS: at 62.94% examples, 608671 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:53,697 :gensim.models.word2vec :INFO : PROGRESS: at 63.97% examples, 609169 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:47:54,707 :gensim.models.word2vec :INFO : PROGRESS: at 65.16% examples, 611179 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:55,708 :gensim.models.word2vec :INFO : PROGRESS: at 66.31% examples, 612923 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:56,709 :gensim.models.word2vec :INFO : PROGRESS: at 67.30% examples, 613086 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:47:57,718 :gensim.models.word2vec :INFO : PROGRESS: at 68.48% examples, 614770 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:47:58,725 :gensim.models.word2vec :INFO : PROGRESS: at 69.42% examples, 614374 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:47:59,730 :gensim.models.word2vec :INFO : PROGRESS: at 70.35% examples, 613877 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:00,753 :gensim.models.word2vec :INFO : PROGRESS: at 71.09% examples, 611666 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-19 17:48:01,767 :gensim.models.word2vec :INFO : PROGRESS: at 72.06% examples, 611557 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:02,774 :gensim.models.word2vec :INFO : PROGRESS: at 72.92% examples, 610505 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:03,778 :gensim.models.word2vec :INFO : PROGRESS: at 73.80% examples, 609723 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:04,784 :gensim.models.word2vec 
:INFO : PROGRESS: at 74.81% examples, 610019 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:48:05,787 :gensim.models.word2vec :INFO : PROGRESS: at 75.87% examples, 610421 words/s, in_qsize 3, out_qsize 2\n", - "2017-09-19 17:48:06,795 :gensim.models.word2vec :INFO : PROGRESS: at 76.87% examples, 610533 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:07,808 :gensim.models.word2vec :INFO : PROGRESS: at 78.06% examples, 612109 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:08,815 :gensim.models.word2vec :INFO : PROGRESS: at 79.07% examples, 612233 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:48:09,823 :gensim.models.word2vec :INFO : PROGRESS: at 80.27% examples, 613807 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:10,833 :gensim.models.word2vec :INFO : PROGRESS: at 81.26% examples, 613696 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:11,842 :gensim.models.word2vec :INFO : PROGRESS: at 82.45% examples, 615110 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-19 17:48:12,851 :gensim.models.word2vec :INFO : PROGRESS: at 83.61% examples, 616375 words/s, in_qsize 6, out_qsize 1\n", - "2017-09-19 17:48:13,853 :gensim.models.word2vec :INFO : PROGRESS: at 84.77% examples, 617712 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:14,879 :gensim.models.word2vec :INFO : PROGRESS: at 85.67% examples, 616893 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:15,890 :gensim.models.word2vec :INFO : PROGRESS: at 86.64% examples, 616837 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:16,906 :gensim.models.word2vec :INFO : PROGRESS: at 87.69% examples, 617193 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:48:17,915 :gensim.models.word2vec :INFO : PROGRESS: at 88.79% examples, 617970 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:18,917 :gensim.models.word2vec :INFO : PROGRESS: at 89.78% examples, 618000 words/s, in_qsize 6, out_qsize 0\n", - "2017-09-19 17:48:19,918 :gensim.models.word2vec 
:INFO : PROGRESS: at 90.91% examples, 618999 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:20,922 :gensim.models.word2vec :INFO : PROGRESS: at 91.96% examples, 619363 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:21,936 :gensim.models.word2vec :INFO : PROGRESS: at 92.85% examples, 618640 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:22,943 :gensim.models.word2vec :INFO : PROGRESS: at 93.84% examples, 618597 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:23,958 :gensim.models.word2vec :INFO : PROGRESS: at 94.85% examples, 618687 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:24,974 :gensim.models.word2vec :INFO : PROGRESS: at 95.90% examples, 618756 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:48:25,978 :gensim.models.word2vec :INFO : PROGRESS: at 96.93% examples, 619000 words/s, in_qsize 4, out_qsize 1\n", - "2017-09-19 17:48:26,980 :gensim.models.word2vec :INFO : PROGRESS: at 97.93% examples, 619074 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:27,982 :gensim.models.word2vec :INFO : PROGRESS: at 98.99% examples, 619431 words/s, in_qsize 5, out_qsize 0\n", - "2017-09-19 17:48:28,823 :gensim.models.word2vec :INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2017-09-19 17:48:28,830 :gensim.models.word2vec :INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2017-09-19 17:48:28,836 :gensim.models.word2vec :INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2017-09-19 17:48:28,837 :gensim.models.word2vec :INFO : training on 85026035 raw words (62526300 effective words) took 100.8s, 620401 effective words/s\n" + "2017-09-30 20:06:18,828 : INFO : PROGRESS: at 77.95% examples, 690734 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:19,838 : INFO : PROGRESS: at 79.07% examples, 690668 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:20,845 : INFO : PROGRESS: at 80.20% examples, 690773 words/s, in_qsize 5, out_qsize 0\n", + 
"2017-09-30 20:06:21,848 : INFO : PROGRESS: at 81.33% examples, 690841 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:22,853 : INFO : PROGRESS: at 82.43% examples, 690748 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:23,861 : INFO : PROGRESS: at 83.55% examples, 690766 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:24,868 : INFO : PROGRESS: at 84.67% examples, 690835 words/s, in_qsize 6, out_qsize 0\n", + "2017-09-30 20:06:25,870 : INFO : PROGRESS: at 85.77% examples, 690917 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:26,876 : INFO : PROGRESS: at 86.88% examples, 690986 words/s, in_qsize 4, out_qsize 1\n", + "2017-09-30 20:06:27,879 : INFO : PROGRESS: at 88.00% examples, 691091 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:28,896 : INFO : PROGRESS: at 89.12% examples, 691157 words/s, in_qsize 5, out_qsize 1\n", + "2017-09-30 20:06:29,900 : INFO : PROGRESS: at 90.22% examples, 691067 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:30,901 : INFO : PROGRESS: at 91.31% examples, 691025 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:31,903 : INFO : PROGRESS: at 92.42% examples, 691030 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:32,915 : INFO : PROGRESS: at 93.52% examples, 690949 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:33,927 : INFO : PROGRESS: at 94.64% examples, 690994 words/s, in_qsize 4, out_qsize 0\n", + "2017-09-30 20:06:34,932 : INFO : PROGRESS: at 95.76% examples, 690838 words/s, in_qsize 5, out_qsize 1\n", + "2017-09-30 20:06:35,949 : INFO : PROGRESS: at 96.86% examples, 690691 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:36,955 : INFO : PROGRESS: at 97.99% examples, 690835 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:37,971 : INFO : PROGRESS: at 99.13% examples, 690908 words/s, in_qsize 5, out_qsize 0\n", + "2017-09-30 20:06:38,752 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2017-09-30 20:06:38,756 : INFO 
: worker thread finished; awaiting finish of 1 more threads\n", + "2017-09-30 20:06:38,759 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2017-09-30 20:06:38,760 : INFO : training on 85026035 raw words (62532240 effective words) took 90.5s, 690828 effective words/s\n" ] } ], @@ -202,32 +194,32 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "2017-09-19 17:48:45,837 :gensim.models.keyedvectors :INFO : precomputing L2-norms of word weight vectors\n" + "2017-09-30 20:08:12,509 : INFO : precomputing L2-norms of word weight vectors\n" ] }, { "data": { "text/plain": [ - "[('leaf', 0.7284336090087891),\n", - " ('trees', 0.7024068236351013),\n", - " ('bark', 0.6984879970550537),\n", - " ('fruit', 0.623538613319397),\n", - " ('flower', 0.6177238821983337),\n", - " ('nest', 0.6133654713630676),\n", - " ('garden', 0.5962027311325073),\n", - " ('avl', 0.5909914374351501),\n", - " ('cave', 0.5902420282363892),\n", - " ('pond', 0.5827507972717285)]" + "[('trees', 0.7073001861572266),\n", + " ('bark', 0.7032904028892517),\n", + " ('leaf', 0.6881209015846252),\n", + " ('bird', 0.6044381260871887),\n", + " ('flower', 0.6009336709976196),\n", + " ('fruit', 0.597153902053833),\n", + " ('avl', 0.5837888717651367),\n", + " ('cactus', 0.5712562799453735),\n", + " ('bee', 0.5658263564109802),\n", + " ('garden', 0.565678596496582)]" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -245,7 +237,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -253,75 +245,73 @@ "output_type": "stream", "text": [ "{\n", - " \"gensim\": {\n", - " \"model\": {\n", - " \"Google_News_word2vec\": {\n", - " \"desc\": \"Google has published pre-trained vectors trained on part of Google News dataset (about 100 billion words). 
The model contains 300-dimensional vectors for 3 million words and phrases.\",\n", - " \"filename\": \"GoogleNews-vectors-negative300.bin.gz\",\n", - " \"checksum\": \"4fa963d128fe65ec8cd5dd4d9377f8ed\"\n", - " },\n", - " \"fasttext_eng_model\": {\n", - " \"desc\": \"fastText is a library for efficient learning of word representations and sentence classification.These vectors for english language in dimension 300 were obtained using the skip-gram model described in Bojanowski et al. (2016) with default parameters.\",\n", - " \"filename\": \"wiki.en.vec\",\n", - " \"checksum\": \"2de532213d7fa8b937263337c6e9deeb\"\n", - " },\n", - " \"glove_common_crawl_42B\": {\n", - " \"desc\": \"This model is trained on Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.42B.300d.zip\",\n", - " \"checksum\": \"d6f41a6e9e5bf905d349a01b5216826a\"\n", - " },\n", - " \"glove_common_crawl_840B\": {\n", - " \"desc\": \"This model is trained on Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.840B.300d.zip\",\n", - " \"checksum\": \"72f02c239743c750eaea8747839e4852\"\n", - " },\n", - " \"glove_wiki_gigaword_300d\": {\n", - " \"desc\": \" This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 300d vectors).GloVe is an unsupervised learning algorithm for obtaining vector representations for words. 
Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.6B.300d.txt\",\n", - " \"checksum\": \"e0c1af43ab57753d11da2fa642c3ff82\"\n", - " },\n", - " \"glove_wiki_gigaword_200d\": {\n", - " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 200d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.6B.200d.txt\",\n", - " \"checksum\": \"c4e58068e16be476b115699f94fa82cb\"\n", - " },\n", - " \"glove_wiki_gigaword_100d\": {\n", - " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 100d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.6B.100d.txt\",\n", - " \"checksum\": \"7067a76b2adc0e92a1f71e2919382c95\"\n", - " },\n", - " \"glove_wiki_gigaword_50d\": {\n", - " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 50d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. 
Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.6B.50d.txt\",\n", - " \"checksum\": \"44d71eb1db9485d9c8a605a5ed560d8c\"\n", - " },\n", - " \"glove_twitter_200d\": {\n", - " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 200d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.twitter.27B.200d.txt\",\n", - " \"checksum\": \"91b40581d04e2ff5306d2f0452e34f72\"\n", - " },\n", - " \"glove_twitter_100d\": {\n", - " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 100d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.twitter.27B.100d.txt\",\n", - " \"checksum\": \"2825c182e4ac2afd8d2dede8445919ab\"\n", - " },\n", - " \"glove_twitter_50d\": {\n", - " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 50d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. 
Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.twitter.27B.50d.txt\",\n", - " \"checksum\": \"9842275a894ebdfb60b270877bb8f60c\"\n", - " },\n", - " \"glove_twitter_25d\": {\n", - " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 25d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", - " \"filename\": \"glove.twitter.27B.25d.txt\",\n", - " \"checksum\": \"9802ffec313d8612bf790d1aa4d37ddd\"\n", - " }\n", + " \"model\": {\n", + " \"Google_News_word2vec\": {\n", + " \"desc\": \"Google has published pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases.\",\n", + " \"filename\": \"GoogleNews-vectors-negative300.bin.gz\",\n", + " \"checksum\": \"4fa963d128fe65ec8cd5dd4d9377f8ed\"\n", + " },\n", + " \"fasttext_eng_model\": {\n", + " \"desc\": \"fastText is a library for efficient learning of word representations and sentence classification.These vectors for english language in dimension 300 were obtained using the skip-gram model described in Bojanowski et al. (2016) with default parameters.\",\n", + " \"filename\": \"wiki.en.vec\",\n", + " \"checksum\": \"2de532213d7fa8b937263337c6e9deeb\"\n", + " },\n", + " \"glove_common_crawl_42B\": {\n", + " \"desc\": \"This model is trained on Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. 
Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.42B.300d.zip\",\n", + " \"checksum\": \"d6f41a6e9e5bf905d349a01b5216826a\"\n", " },\n", - " \"corpus\": {\n", - " \"text8\": {\n", - " \"desc\": \"Wikipedia English corpus\",\n", - " \"filename\": \"text8\",\n", - " \"checksum\": \"5d703f1842fb1ca55bf86f2e2552012c\"\n", - " }\n", + " \"glove_common_crawl_840B\": {\n", + " \"desc\": \"This model is trained on Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.840B.300d.zip\",\n", + " \"checksum\": \"72f02c239743c750eaea8747839e4852\"\n", + " },\n", + " \"glove_wiki_gigaword_300d\": {\n", + " \"desc\": \" This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 300d vectors).GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.6B.300d.txt\",\n", + " \"checksum\": \"e0c1af43ab57753d11da2fa642c3ff82\"\n", + " },\n", + " \"glove_wiki_gigaword_200d\": {\n", + " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 200d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. 
Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.6B.200d.txt\",\n", + " \"checksum\": \"c4e58068e16be476b115699f94fa82cb\"\n", + " },\n", + " \"glove_wiki_gigaword_100d\": {\n", + " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 100d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.6B.100d.txt\",\n", + " \"checksum\": \"7067a76b2adc0e92a1f71e2919382c95\"\n", + " },\n", + " \"glove_wiki_gigaword_50d\": {\n", + " \"desc\": \"This model is trained on Wikipedia 2014 + Gigaword 56B tokens, 400K vocab, uncased, 50d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.6B.50d.txt\",\n", + " \"checksum\": \"44d71eb1db9485d9c8a605a5ed560d8c\"\n", + " },\n", + " \"glove_twitter_200d\": {\n", + " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 200d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. 
Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.twitter.27B.200d.txt\",\n", + " \"checksum\": \"91b40581d04e2ff5306d2f0452e34f72\"\n", + " },\n", + " \"glove_twitter_100d\": {\n", + " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 100d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.twitter.27B.100d.txt\",\n", + " \"checksum\": \"2825c182e4ac2afd8d2dede8445919ab\"\n", + " },\n", + " \"glove_twitter_50d\": {\n", + " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 50d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.twitter.27B.50d.txt\",\n", + " \"checksum\": \"9842275a894ebdfb60b270877bb8f60c\"\n", + " },\n", + " \"glove_twitter_25d\": {\n", + " \"desc\": \"This model is trained on twitter(2B tweets, 27B tokens, 1.2M vocab, uncased, 25d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. 
Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\",\n", + " \"filename\": \"glove.twitter.27B.25d.txt\",\n", + " \"checksum\": \"9802ffec313d8612bf790d1aa4d37ddd\"\n", + " }\n", + " },\n", + " \"corpus\": {\n", + " \"text8\": {\n", + " \"desc\": \"Wikipedia English corpus\",\n", + " \"filename\": \"text8\",\n", + " \"checksum\": \"5d703f1842fb1ca55bf86f2e2552012c\"\n", " }\n", " }\n", "}\n" @@ -330,8 +320,8 @@ ], "source": [ "import json\n", - "dataset_list = api.info()\n", - "print(json.dumps(dataset_list, indent=4))" + "data_list = api.info()\n", + "print(json.dumps(data_list, indent=4))" ] }, { @@ -343,20 +333,20 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2017-09-19 21:56:42,071 :gensim.api :INFO : This model is trained on Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space. \n", - "\n" + "This model is trained on Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors). GloVe is an unsupervised learning algorithm for obtaining vector representations for words. 
Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.\n" ] } ], "source": [ - "api.info('glove_common_crawl_42B')" + "glove_common_crawl_info = api.info('glove_common_crawl_42B')\n", + "print(glove_common_crawl_info)" ] }, { @@ -368,7 +358,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": { "collapsed": true }, diff --git a/gensim/downloader.py b/gensim/downloader.py index 8edd3062b6..61fb294f00 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -19,27 +19,47 @@ user_dir = os.path.expanduser('~') base_dir = os.path.join(user_dir, 'gensim-data') -data_log_file_dir = os.path.join(base_dir, 'data.json') - -logging.basicConfig( - format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', - stream=sys.stdout, level=logging.INFO) +data_log_file_path = os.path.join(base_dir, 'data.json') logger = logging.getLogger('gensim.api') -if not os.path.isdir(base_dir): - try: - logger.info("Creating %s", base_dir) - os.makedirs(base_dir) - except OSError as e: - if e.errno == errno.EEXIST: - raise Exception( - "Not able to create folder gensim-data in {}. File gensim-data " - "exists in the direcory already.".format(user_dir)) - else: - raise Exception( - "Can't create {}. Make sure you have the read/write permissions " - "to the directory or you can try creating the folder manually" - .format(base_dir)) +def get_data_list(): + """Function getting the list of all datasets/models. + + Returns: + list: returns a list of datasets/models avalible for installation. 
+ """ + url = "https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/list_with_filename.json" + response = urlopen(url) + data = response.read().decode("utf-8") + data = json.loads(data) + data_names = [] + corpora = data['corpus'] + models = data['model'] + for corpus in corpora: + data_names.append(corpus) + for model in models: + data_names.append(model) + return data_names + + +def get_data_name(data_): + """Returns a name for the dataset/model as to download a dataset/model user can alternate names too. + + Args: + data_(string): Name of the corpus/model. + + Returns: + data_: returns the name for dataset/model + """ + url = "https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/alternate_names.json" + response = urlopen(url) + alternate_names_json = response.read().decode("utf-8") + alternate_names_json = json.loads(alternate_names_json) + data_names = get_data_list() + for data_name in data_names: + alternate_data_names = alternate_names_json[data_name] + if data_ in alternate_data_names: + return data_name def initialize_data_log_file(): @@ -47,8 +67,8 @@ def initialize_data_log_file(): for each corpus/model and stores in the log file. For eg: {"name": "text8", "status" : "None"} """ data = info() - corpora = data['gensim']['corpus'] - models = data['gensim']['model'] + corpora = data['corpus'] + models = data['model'] json_list = [] for corpus in corpora: json_object = {"name": corpus, "status": "None"} @@ -56,37 +76,72 @@ def initialize_data_log_file(): for model in models: json_object = {"name": model, "status": "None"} json_list.append(json_object) - json.dump(json_list, data_log_file) - data_log_file.close() + + with open(data_log_file_path,'w') as f: + f.write(json.dumps(json_list)) + + +def create_files(): + """Function for creating the directory for storing corpora and models, and to create a json log file. 
+ """ + if not os.path.isdir(base_dir): + try: + logger.info("Creating %s", base_dir) + os.makedirs(base_dir) + except OSError as e: + if e.errno == errno.EEXIST: + raise Exception( + "Not able to create folder gensim-data in {}. File gensim-data " + "exists in the direcory already.".format(user_dir)) + else: + raise Exception( + "Can't create {}. Make sure you have the read/write permissions " + "to the directory or you can try creating the folder manually" + .format(base_dir)) + + if not os.path.isfile(data_log_file_path): + try: + logger.warning("Creating %s", data_log_file_path) + with open(data_log_file_path, 'w+'): + pass + initialize_data_log_file() + except: + raise Exception( + "Can't create {}. Make sure you have the read/write permissions " + "to the directory or you can try creating the file manually" + .format(data_log_file_path)) -def update_data_log_file(dataset, status): - """Function for updating the status of the dataset json object. +def update_data_log_file(data_, status): + """Function for updating the status of the data_ json object. Args: - dataset(string): Name of the corpus/model. + data_(string): Name of the corpus/model. status(string): Status to be updates to i.e downloaded or installed. """ - jdata = json.loads(open(data_log_file_dir).read()) - for json_object in jdata: - if json_object["name"] == dataset: - json_object["status"] = status - with open(data_log_file_dir, 'w') as f: + with open(data_log_file_path, 'r') as f: + jdata = json.load(f) + for json_object in jdata: + if json_object["name"] == data_: + json_object["status"] = status + with open(data_log_file_path, 'w+') as f: f.write(json.dumps(jdata)) -def get_data_status(dataset): - """Function for finding the status of the dataset. + +def get_data_status(data_): + """Function for finding the status of the data_. Args: - dataset(string): Name of the corpus/model. + data_(string): Name of the corpus/model. 
Returns: string: returns the current status of the corpus/model i.e None, downloaded or installed. """ - jdata = json.loads(open(data_log_file_dir).read()) + with open(data_log_file_path, 'r') as f: + jdata = json.load(f) for json_object in jdata: - if json_object["name"] == dataset: + if json_object["name"] == data_: return json_object["status"] @@ -108,27 +163,30 @@ def calculate_md5_checksum(folder_dir): return hash_md5.hexdigest() -def info(dataset=None): - """Function for retrieving the list of corpora/models, if dataset is not provided. If dataset - is provided, then it gives detailed information about the dataset. +def info(data_=None): + """Function for retrieving the list of corpora/models, if data name is not provided. If data name + is provided, then it gives detailed information about the data. Args: - dataset(string): Name of the corpus/model. + data_(string): Name of the corpus/model. Returns: : It returns the models/corpora names with detailed information about each. """ - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/list_with_filename.json" + url = "https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/list_with_filename.json" response = urlopen(url) data = response.read().decode("utf-8") data = json.loads(data) - if dataset is not None: - corpora = data['gensim']['corpus'] - models = data['gensim']['model'] - if dataset in corpora: - logger.info("%s \n", data['gensim']['corpus'][dataset]["desc"]) - elif dataset in models: - logger.info("%s \n", data['gensim']['model'][dataset]["desc"]) + if data_ is not None: + data_ = get_data_name(data_) + corpora = data['corpus'] + models = data['model'] + if data_ in corpora: + logger.info("%s \n", data['corpus'][data_]["desc"]) + return data['corpus'][data_]["desc"] + elif data_ in models: + logger.info("%s \n", data['model'][data_]["desc"]) + return data['model'][data_]["desc"] else: raise Exception( "Incorrect model/corpus name. 
Choose the model/corpus from the list " @@ -137,55 +195,44 @@ def info(dataset=None): return data -def get_checksum(dataset): +def get_checksum(data_): """Function for retrieving the checksum of a corpus/model Args: - dataset(string): Name of the corpus/model. + data_(string): Name of the corpus/model. Returns: string: It returns the checksum for corresponding the corpus/model. """ data = info() - corpora = data['gensim']['corpus'] - models = data['gensim']['model'] - if dataset in corpora: - return data['gensim']['corpus'][dataset]["checksum"] - elif dataset in models: - return data['gensim']['model'][dataset]["checksum"] - - -if not os.path.isfile(data_log_file_dir): - try: - logger.warning("Creating %s", data_log_file_dir) - data_log_file = open(data_log_file_dir, 'a') - initialize_data_log_file() - except: - raise Exception( - "Can't create {}. Make sure you have the read/write permissions " - "to the directory or you can try creating the file manually" - .format(data_log_file_dir)) + corpora = data['corpus'] + models = data['model'] + if data_ in corpora: + return data['corpus'][data_]["checksum"] + elif data_ in models: + return data['model'][data_]["checksum"] + -def _download(dataset): - """Function for downloading and installed dataset depending upon it's current status. +def _download(data_): + """Function for downloading and installed corpus/model depending upon it's current status. Args: - dataset(string): Name of the corpus/model. + data_(string): Name of the corpus/model. 
""" - url = "https://github.com/chaitaliSaini/Corpus_and_models/releases/download/{f}/{f}.tar.gz".format(f=dataset) - data_folder_dir = os.path.join(base_dir, dataset) + url = "https://github.com/RaRe-Technologies/gensim-data/releases/download/{f}/{f}.tar.gz".format(f=data_) + data_folder_dir = os.path.join(base_dir, data_) data = info() - corpora = data['gensim']['corpus'] - models = data['gensim']['model'] - if dataset not in corpora and dataset not in models: + corpora = data['corpus'] + models = data['model'] + if data_ not in corpora and data_ not in models: raise Exception( "Incorect Model/corpus name. Use info() or" " python -m gensim.downloader -c to get a list of models/corpora" " available.") - compressed_folder_name = "{f}.tar.gz".format(f=dataset) + compressed_folder_name = "{f}.tar.gz".format(f=data_) compressed_folder_dir = os.path.join(base_dir, compressed_folder_name) - if get_data_status(dataset) != "downloaded": + if get_data_status(data_) != "downloaded": if not os.path.exists(data_folder_dir): logger.info("Creating %s", data_folder_dir) os.makedirs(data_folder_dir) @@ -196,103 +243,106 @@ def _download(dataset): "Not able to create {a}. Make sure you have the correct read/" "write permissions for {b} or you can try creating it manually". 
format(a=data_folder_dir, b=base_dir)) - logger.info("Downloading %s", dataset) + logger.info("Downloading %s", data_) urllib.urlretrieve(url, compressed_folder_dir) - data_url = data_links(dataset) + data_url = data_links(data_) if data_url is not None: index = data_url.rfind("/") data_dir = os.path.join(data_folder_dir, data_url[index + 1:]) urllib.urlretrieve(data_url, data_dir) - logger.info("%s downloaded", dataset) - update_data_log_file(dataset, status="downloaded") - if get_data_status(dataset) != "installed": + logger.info("%s downloaded", data_) + update_data_log_file(data_, status="downloaded") + if get_data_status(data_) != "installed": tar = tarfile.open(compressed_folder_dir) logger.info("Extracting files from %s", data_folder_dir) tar.extractall(data_folder_dir) tar.close() - if calculate_md5_checksum(data_folder_dir) == get_checksum(dataset): - update_data_log_file(dataset, status="installed") - logger.info("%s installed", dataset) + if calculate_md5_checksum(data_folder_dir) == get_checksum(data_): + update_data_log_file(data_, status="installed") + logger.info("%s installed", data_) else: logger.error("There was a problem in installing the file. Retrying.") - _download(dataset) + _download(data_) -def get_filename(dataset): +def get_filename(data_): """Function of retrieving the filename of corpus/model. Args: - dataset(string): Name of the corpus/model. + data_(string): Name of the corpus/model. Returns: string: Returns the filename of the model/corpus. 
""" data = info() - corpora = data['gensim']['corpus'] - models = data['gensim']['model'] - if dataset in corpora: - return data['gensim']['corpus'][dataset]["filename"] - elif dataset in models: - return data['gensim']['model'][dataset]["filename"] + corpora = data['corpus'] + models = data['model'] + if data_ in corpora: + return data['corpus'][data_]["filename"] + elif data_ in models: + return data['model'][data_]["filename"] -def load(dataset, return_path=False): +def load(data_, return_path=False): """Loads the corpus/model to the memory, if return_path is False. Args: - dataset(string): Name of the corpus/model. + data_(string): Name of the corpus/model. return_path(bool): Determines whether to return model/corpus file path. Returns: string: Returns the path to the model/corpus, if return_path is True. """ - file_name = get_filename(dataset) + data_ = get_data_name(data_) + create_files() + file_name = get_filename(data_) if file_name is None: raise Exception( "Incorrect model/corpus name. Choose the model/corpus from the list " "\n {}".format(json.dumps(info(), indent=4))) - folder_dir = os.path.join(base_dir, dataset) + folder_dir = os.path.join(base_dir, data_) file_dir = os.path.join(folder_dir, file_name) - if not os.path.exists(folder_dir) or get_data_status(dataset) != "installed": - _download(dataset) + if not os.path.exists(folder_dir) or get_data_status(data_) != "installed": + _download(data_) if return_path: return file_dir else: sys.path.insert(0, base_dir) - module = __import__(dataset) + module = __import__(data_) data = module.load_data() return data -def data_links(dataset): +def data_links(data_): """Function for retrieving the links of the models/corpus which are not stored in github releases Args: - dataset(string): Name of the corpus/model. + data_(string): Name of the corpus/model. Returns: string: Returns the link of the model/corpus. 
""" - url = "https://raw.githubusercontent.com/chaitaliSaini/Corpus_and_models/master/links.json" + url = "https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/links.json" response = urlopen(url) data = response.read().decode("utf-8") data = json.loads(data) - if dataset in data['data_links']: - return data['data_links'][dataset]['link'] + if data_ in data['data_links']: + return data['data_links'][data_]['link'] if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Gensim console API", usage="python -m gensim.api.downloader [-h] [-d dataset_name | -i dataset_name | -c]") + logging.basicConfig(format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', stream=sys.stdout, level=logging.INFO) + parser = argparse.ArgumentParser(description="Gensim console API", usage="python -m gensim.api.downloader [-h] [-d data__name | -i data__name | -c]") group = parser.add_mutually_exclusive_group() - group.add_argument("-d", "--download", metavar="dataset_name", nargs=1, help="To download a corpus/model : python -m gensim -d corpus/model name") - group.add_argument("-i", "--info", metavar="dataset_name", nargs=1, help="To get information about a corpus/model : python -m gensim -i model/corpus name") + group.add_argument("-d", "--download", metavar="data__name", nargs=1, help="To download a corpus/model : python -m gensim -d corpus/model name") + group.add_argument("-i", "--info", metavar="data__name", nargs=1, help="To get information about a corpus/model : python -m gensim -i model/corpus name") group.add_argument("-c", "--catalogue", help="To get the list of all models/corpus stored : python -m gensim -c", action="store_true") args = parser.parse_args() if args.download is not None: data_path = load(args.download[0], return_path=True) logger.info("Data has been installed and data path is %s", data_path) elif args.info is not None: - info(dataset=args.info[0]) + info(data_=args.info[0]) elif args.catalogue is not None: data = info() 
logger.info("%s\n", json.dumps(data, indent=4)) From 498b32b2f175925189154fe4937eb95ddb7d17e4 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Tue, 3 Oct 2017 20:10:36 +0530 Subject: [PATCH 14/15] corrected formatting --- gensim/downloader.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/gensim/downloader.py b/gensim/downloader.py index 61fb294f00..2c9297baa4 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -22,6 +22,7 @@ data_log_file_path = os.path.join(base_dir, 'data.json') logger = logging.getLogger('gensim.api') + def get_data_list(): """Function getting the list of all datasets/models. @@ -43,7 +44,7 @@ def get_data_list(): def get_data_name(data_): - """Returns a name for the dataset/model as to download a dataset/model user can alternate names too. + """Returns a name for the dataset/model as to download a dataset/model user can alternate names too. Args: data_(string): Name of the corpus/model. @@ -77,12 +78,12 @@ def initialize_data_log_file(): json_object = {"name": model, "status": "None"} json_list.append(json_object) - with open(data_log_file_path,'w') as f: + with open(data_log_file_path, 'w') as f: f.write(json.dumps(json_list)) - + def create_files(): - """Function for creating the directory for storing corpora and models, and to create a json log file. + """Function for creating the directory for storing corpora and models, and to create a json log file. """ if not os.path.isdir(base_dir): try: @@ -119,7 +120,7 @@ def update_data_log_file(data_, status): data_(string): Name of the corpus/model. status(string): Status to be updates to i.e downloaded or installed. 
""" - with open(data_log_file_path, 'r') as f: + with open(data_log_file_path, 'r') as f: jdata = json.load(f) for json_object in jdata: if json_object["name"] == data_: @@ -128,8 +129,7 @@ def update_data_log_file(data_, status): f.write(json.dumps(jdata)) - -def get_data_status(data_): +def get_data_status(data_): """Function for finding the status of the data_. Args: @@ -163,7 +163,7 @@ def calculate_md5_checksum(folder_dir): return hash_md5.hexdigest() -def info(data_=None): +def info(data_=None): """Function for retrieving the list of corpora/models, if data name is not provided. If data name is provided, then it gives detailed information about the data. @@ -213,7 +213,6 @@ def get_checksum(data_): return data['model'][data_]["checksum"] - def _download(data_): """Function for downloading and installed corpus/model depending upon it's current status. @@ -293,7 +292,7 @@ def load(data_, return_path=False): Returns: string: Returns the path to the model/corpus, if return_path is True. """ - data_ = get_data_name(data_) + data_ = get_data_name(data_) create_files() file_name = get_filename(data_) if file_name is None: From 03649b00973b112e54726c7db9f59d835c394b04 Mon Sep 17 00:00:00 2001 From: chaitalisaini Date: Thu, 5 Oct 2017 16:47:00 +0530 Subject: [PATCH 15/15] added checksum after download --- gensim/downloader.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/gensim/downloader.py b/gensim/downloader.py index 2c9297baa4..579e93e51f 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -145,14 +145,15 @@ def get_data_status(data_): return json_object["status"] -def calculate_md5_checksum(folder_dir): - """Function for calculating checksum of a downloaded model/corpus. +def calculate_md5_checksum(folder_dir, tar_file=None): + """Function for calculating checksum of a downloaded or installed model/corpus. Args: - folder_dir(string): Path to the downloaded model. 
+        folder_dir(string): Path to the model/corpus folder.(contains model/corpus if proxied)
+        tar_file(string): Path to the downloaded tar file. Tar file contains __init__.py file and the model/corpus(if it is stored in github releases)
 
     Returns:
-        string: It returns the value for the checksum for folder_dir directory
+        string: It returns the value for the checksum for folder_dir directory and the tar file.
     """
     hash_md5 = hashlib.md5()
     for filename in os.listdir(folder_dir):
@@ -160,6 +161,10 @@ def calculate_md5_checksum(folder_dir):
         with open(file_dir, "rb") as f:
             for chunk in iter(lambda: f.read(4096), b""):
                 hash_md5.update(chunk)
+    if tar_file is not None:
+        with open(tar_file, "rb") as f:
+            for chunk in iter(lambda: f.read(4096), b""):
+                hash_md5.update(chunk)
     return hash_md5.hexdigest()
 
 
@@ -195,7 +200,7 @@ def info(data_=None):
     return data
 
 
-def get_checksum(data_):
+def get_checksum(data_, status):
     """Function for retrieving the checksum of a corpus/model
 
     Args:
@@ -204,13 +209,14 @@ def get_checksum(data_):
     Returns:
         string: It returns the checksum for corresponding the corpus/model.
     """
+    key = "checksum_after_" + status
     data = info()
     corpora = data['corpus']
     models = data['model']
     if data_ in corpora:
-        return data['corpus'][data_]["checksum"]
+        return data['corpus'][data_][key]
    elif data_ in models:
-        return data['model'][data_]["checksum"]
+        return data['model'][data_][key]
 
 
 def _download(data_):
@@ -249,18 +255,23 @@ def _download(data_):
     index = data_url.rfind("/")
     data_dir = os.path.join(data_folder_dir, data_url[index + 1:])
     urllib.urlretrieve(data_url, data_dir)
-    logger.info("%s downloaded", data_)
-    update_data_log_file(data_, status="downloaded")
+    if calculate_md5_checksum(data_folder_dir, compressed_folder_dir) == get_checksum(data_, "download"):
+        logger.info("%s downloaded", data_)
+        update_data_log_file(data_, status="downloaded")
+    else:
+        logger.error("There was a problem in downloading the data. Retrying.")
+        _download(data_)
+
     if get_data_status(data_) != "installed":
         tar = tarfile.open(compressed_folder_dir)
         logger.info("Extracting files from %s", data_folder_dir)
         tar.extractall(data_folder_dir)
         tar.close()
-        if calculate_md5_checksum(data_folder_dir) == get_checksum(data_):
+        if calculate_md5_checksum(data_folder_dir) == get_checksum(data_, "installation"):
             update_data_log_file(data_, status="installed")
             logger.info("%s installed", data_)
         else:
-            logger.error("There was a problem in installing the file. Retrying.")
+            logger.error("There was a problem in installing the dataset/model. Retrying.")
             _download(data_)
 
 
@@ -303,6 +314,7 @@ def load(data_, return_path=False):
     file_dir = os.path.join(folder_dir, file_name)
     if not os.path.exists(folder_dir) or get_data_status(data_) != "installed":
         _download(data_)
+
     if return_path:
         return file_dir
     else:
@@ -330,7 +342,7 @@ def data_links(data_):
 
 
 if __name__ == '__main__':
-    logging.basicConfig(format='%(asctime)s :%(name)s :%(levelname)s : %(message)s', stream=sys.stdout, level=logging.INFO)
+    logging.basicConfig(format='%(asctime)s :%(name)s :%(levelname)s :%(message)s', stream=sys.stdout, level=logging.INFO)
     parser = argparse.ArgumentParser(description="Gensim console API", usage="python -m gensim.api.downloader [-h] [-d data__name | -i data__name | -c]")
     group = parser.add_mutually_exclusive_group()
     group.add_argument("-d", "--download", metavar="data__name", nargs=1, help="To download a corpus/model : python -m gensim -d corpus/model name")