From 117d447087b2e9a36fda1279d837e045fefd1558 Mon Sep 17 00:00:00 2001 From: Yuri Isakov Date: Fri, 9 Feb 2018 05:48:32 +0300 Subject: [PATCH] Refactor docstrings for `gensim.scripts`. Partial fix #1665 (#1792) * drafts docstrings * docstrings added, Ivan need to check format * fixed links, headers, typos and etc * docstrings moved to scripts inside and appear as console output in docs. setup.py and conf.py need to add sphinxcontrib.programoutput as new extension * update configs with new extension sphinxcontrib.programoutput * fix doc[1] * finish with glove2word2vec (RawFormatter + examples) * more examples for glove2word2vec * simplify example for g2w2v + fix for w2v2t * fix segment_wiki * revert make_wikicorpus (should be fixed in refactoring) * revert [2] --- docs/src/conf.py | 2 +- gensim/scripts/glove2word2vec.py | 101 +++++++++++++++++++++++------- gensim/scripts/segment_wiki.py | 65 +++++++++++-------- gensim/scripts/word2vec2tensor.py | 71 ++++++++++++--------- setup.py | 2 +- 5 files changed, 159 insertions(+), 82 deletions(-) diff --git a/docs/src/conf.py b/docs/src/conf.py index b8b108dbf6..5bef5a42b4 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -25,7 +25,7 @@ # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.napoleon', 'sphinx.ext.imgmath'] +extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.napoleon', 'sphinx.ext.imgmath', 'sphinxcontrib.programoutput'] autoclass_content = "both" # Add any paths that contain templates here, relative to this directory. 
diff --git a/gensim/scripts/glove2word2vec.py b/gensim/scripts/glove2word2vec.py index 30f62c9b11..8574d6ff77 100644 --- a/gensim/scripts/glove2word2vec.py +++ b/gensim/scripts/glove2word2vec.py @@ -5,20 +5,56 @@ # Copyright (C) 2016 Manas Ranjan Kar # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -USAGE: - $ python -m gensim.scripts.glove2word2vec --input --output -Where: +"""This script converts GloVe vectors into the word2vec format. Both files are +stored in text format and are almost identical, except that word2vec includes a header line with the +number of vectors and their dimension, which is the only difference from GloVe. -* : Input GloVe .txt file. -* : Desired name of output Word2vec .txt file. +Notes +----- -This script is used to convert GloVe vectors in text format into the word2vec text format. -The only difference between the two formats is an extra header line in word2vec, -which contains the number of vectors and their dimensionality (two integers). -""" +GloVe format (a real example can be found `on the Stanford site <https://nlp.stanford.edu/projects/glove/>`_) :: + + word1 0.123 0.134 0.532 0.152 + word2 0.934 0.412 0.532 0.159 + word3 0.334 0.241 0.324 0.188 + ... + word9 0.334 0.241 0.324 0.188 + + +Word2Vec format (a real example can be found `in the old w2v repository <https://code.google.com/archive/p/word2vec/>`_) :: + + 9 4 + word1 0.123 0.134 0.532 0.152 + word2 0.934 0.412 0.532 0.159 + word3 0.334 0.241 0.324 0.188 + ... 
+ word9 0.334 0.241 0.324 0.188 + + +How to use +---------- +>>> from gensim.test.utils import datapath, get_tmpfile +>>> from gensim.models import KeyedVectors +>>> +>>> glove_file = datapath('test_glove.txt') +>>> tmp_file = get_tmpfile("test_word2vec.txt") +>>> +>>> # call glove2word2vec script +>>> # default way (through CLI): python -m gensim.scripts.glove2word2vec --input <glove_file> --output <w2v_file> +>>> from gensim.scripts.glove2word2vec import glove2word2vec +>>> glove2word2vec(glove_file, tmp_file) +>>> +>>> model = KeyedVectors.load_word2vec_format(tmp_file) + + +Command line arguments +---------------------- + +.. program-output:: python -m gensim.scripts.glove2word2vec --help + :ellipsis: 0, -5 +""" import sys import logging import argparse @@ -29,7 +65,19 @@ def get_glove_info(glove_file_name): - """Return the number of vectors and dimensions in a file in GloVe format.""" + """Get number of vectors in provided `glove_file_name` and dimension of vectors. + + Parameters + ---------- + glove_file_name : str + Path to file in GloVe format. + + Returns + ------- + (int, int) + Number of vectors (lines) of input file and its dimension. + + """ with smart_open(glove_file_name) as f: num_lines = sum(1 for _ in f) with smart_open(glove_file_name) as f: @@ -38,7 +86,21 @@ def get_glove_info(glove_file_name): def glove2word2vec(glove_input_file, word2vec_output_file): - """Convert `glove_input_file` in GloVe format into `word2vec_output_file` in word2vec format.""" + """Convert `glove_input_file` in GloVe format to word2vec format and write it to `word2vec_output_file`. + + Parameters + ---------- + glove_input_file : str + Path to file in GloVe format. + word2vec_output_file : str + Path to output file. + + Returns + ------- + (int, int) + Number of vectors (lines) of input file and its dimension. 
+ + """ num_lines, num_dims = get_glove_info(glove_input_file) logger.info("converting %i vectors from %s to %s", num_lines, glove_input_file, word2vec_output_file) with smart_open(word2vec_output_file, 'wb') as fout: @@ -50,17 +112,12 @@ def glove2word2vec(glove_input_file, word2vec_output_file): if __name__ == "__main__": - logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) - logging.root.setLevel(level=logging.INFO) - logger.info("running %s", ' '.join(sys.argv)) - - parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input", required=True, help="Input file, in gloVe format (read-only).") - parser.add_argument( - "-o", "--output", required=True, help="Output file, in word2vec text format (will be overwritten)." - ) + logging.basicConfig(format='%(asctime)s - %(module)s - %(levelname)s - %(message)s', level=logging.INFO) + parser = argparse.ArgumentParser(description=__doc__[:-135], formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("-i", "--input", required=True, help="Path to input file in GloVe format") + parser.add_argument("-o", "--output", required=True, help="Path to output file") args = parser.parse_args() - # do the actual conversion + logger.info("running %s", ' '.join(sys.argv)) num_lines, num_dims = glove2word2vec(args.input, args.output) logger.info('Converted model with %i vectors and %i dimensions', num_lines, num_dims) diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py index 98b99e91d8..9a9c19b43f 100644 --- a/gensim/scripts/segment_wiki.py +++ b/gensim/scripts/segment_wiki.py @@ -4,39 +4,51 @@ # Author: Jayant Jain # Copyright (C) 2016 RaRe Technologies -""" -CLI script for extracting plain text out of a raw Wikipedia dump. 
Input is an xml.bz2 file provided by MediaWiki \ -that looks like wiki--pages-articles.xml.bz2 or wiki-latest-pages-articles.xml.bz2 \ +"""This script is used for extracting plain text out of a raw Wikipedia dump. Input is an xml.bz2 file provided +by MediaWiki that looks like <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2 or <LANG>wiki-latest-pages-articles.xml.bz2 (e.g. 14 GB of https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2). -It streams through all the XML articles using multiple cores (#cores - 1, by default), \ +It streams through all the XML articles using multiple cores (#cores - 1, by default), decompressing on the fly and extracting plain text from the articles and their sections. For each extracted article, it prints its title, section names and plain text section contents, in json-line format. -Examples --------- +How to use +---------- +#. Process Wikipedia dump with this script :: + + python -m gensim.scripts.segment_wiki -i -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz + +#. Read the output in a simple way + + >>> from smart_open import smart_open + >>> import json + >>> + >>> # iterate over the plain text data we just created + >>> for line in smart_open('enwiki-latest.json.gz'): + >>> # decode each JSON line into a Python dictionary object + >>> article = json.loads(line) + >>> + >>> # each article has a "title", a mapping of interlinks and a list of "section_titles" and "section_texts". + >>> print("Article title: %s" % article['title']) + >>> print("Interlinks: %s" % article['interlinks']) + >>> for section_title, section_text in zip(article['section_titles'], article['section_texts']): + >>> print("Section title: %s" % section_title) + >>> print("Section text: %s" % section_text) + + +Notes +----- +Processing the entire English Wikipedia dump takes 1.7 hours (about 3 million articles per hour, +or 10 MB of XML per second) on an 8 core Intel i7-7700 @3.60GHz. 
- python -m gensim.scripts.segment_wiki -h - python -m gensim.scripts.segment_wiki -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz +Command line arguments +---------------------- -Processing the entire English Wikipedia dump takes 1.7 hours (about 3 million articles per hour, \ -or 10 MB of XML per second) on an 8 core Intel i7-7700 @3.60GHz. +.. program-output:: python -m gensim.scripts.segment_wiki --help + :ellipsis: 0, -10 -You can then read the created output (~6.1 GB gzipped) with: - ->>> # iterate over the plain text data we just created ->>> for line in smart_open('enwiki-latest.json.gz'): ->>> # decode each JSON line into a Python dictionary object ->>> article = json.loads(line) ->>> ->>> # each article has a "title", a mapping of interlinks and a list of "section_titles" and "section_texts". ->>> print("Article title: %s" % article['title']) ->>> print("Interlinks: %s" + article['interlinks']) ->>> for section_title, section_text in zip(article['section_titles'], article['section_texts']): ->>> print("Section title: %s" % section_title) ->>> print("Section text: %s" % section_text) """ import argparse @@ -338,10 +350,8 @@ def get_texts_with_sections(self): if __name__ == "__main__": - logging.basicConfig(format='%(asctime)s : %(processName)s : %(levelname)s : %(message)s', level=logging.INFO) - logger.info("running %s", " ".join(sys.argv)) - - parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=globals()['__doc__']) + logging.basicConfig(format='%(asctime)s - %(module)s - %(levelname)s - %(message)s', level=logging.INFO) + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=__doc__[:-136]) default_workers = max(1, multiprocessing.cpu_count() - 1) parser.add_argument('-f', '--file', help='Path to MediaWiki database dump (read-only).', required=True) parser.add_argument( @@ -367,6 +377,7 @@ def get_texts_with_sections(self): ) args = parser.parse_args() + 
+ logger.info("running %s", " ".join(sys.argv)) segment_and_write_all_articles( args.file, args.output, min_article_character=args.min_article_character, diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py index c445253b26..2618bdcae0 100644 --- a/gensim/scripts/word2vec2tensor.py +++ b/gensim/scripts/word2vec2tensor.py @@ -5,32 +5,37 @@ # Copyright (C) 2016 Silvio Olivastri # Copyright (C) 2016 Radim Rehurek -""" -USAGE: $ python -m gensim.scripts.word2vec2tensor --input --output \ - [--binary] -Where: +"""This script converts word-vectors from word2vec format into Tensorflow 2D tensor and metadata format. +This script is used for word-vector visualization on `Embedding Visualization <http://projector.tensorflow.org/>`_. -* : Input Word2Vec model. -* : 2D tensor TSV output file name prefix. -* : Set True if Word2Vec model is binary. Defaults to False. -Output: - The script will create two TSV files. A 2d tensor format file, and a Word Embedding metadata file. Both files will - use the --output file name as prefix. +How to use +---------- +#. Convert your word-vector with this script (for example, we'll use model from + `gensim-data <https://github.com/RaRe-Technologies/gensim-data>`_) :: -This script is used to convert the word2vec format to Tensorflow 2D tensor -and metadata formats for Embedding Visualization. -To use the generated TSV 2D tensor and metadata file in the Projector Visualizer, please + python -m gensim.downloader -d glove-wiki-gigaword-50 # download model in word2vec format + python -m gensim.scripts.word2vec2tensor -i ~/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz \ + -o /tmp/my_model_prefix -1) Open http://projector.tensorflow.org/. -2) Choose "Load Data" from the left menu. -3) Select "Choose file" in "Load a TSV file of vectors." and choose you local "_tensor.tsv" file. -4) Select "Choose file" in "Load a TSV file of metadata." and choose you local "_metadata.tsv" file. +#. Open http://projector.tensorflow.org/ +#. Click the "Load Data" button in the left menu. +#. 
Select "Choose file" in "Load a TSV file of vectors." and choose "/tmp/my_model_prefix_tensor.tsv" file. +#. Select "Choose file" in "Load a TSV file of metadata." and choose "/tmp/my_model_prefix_metadata.tsv" file. +#. ??? +#. PROFIT! For more information about TensorBoard TSV format please visit: https://www.tensorflow.org/versions/master/how_tos/embedding_viz/ + +Command line arguments +---------------------- + +.. program-output:: python -m gensim.scripts.word2vec2tensor --help + :ellipsis: 0, -7 + """ import os @@ -44,12 +49,18 @@ def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False): - """Convert Word2Vec mode to 2D tensor TSV file and metadata file + """Convert file in Word2Vec format and write two TSV files: a 2D tensor file and a metadata file. + + File "tensor_filename"_tensor.tsv contains word-vectors, "tensor_filename"_metadata.tsv contains words. - Args: - word2vec_model_path (str): word2vec model file path. - tensor_filename (str): filename prefix. - binary (bool): set True to use a binary Word2Vec model, defaults to False. + Parameters + ---------- + word2vec_model_path : str + Path to file in Word2Vec format. + tensor_filename : str + Prefix for output files. + binary : bool, optional + True if input file is in binary format. 
""" model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=binary) @@ -68,18 +79,16 @@ def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False): if __name__ == "__main__": - logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) - logging.root.setLevel(level=logging.INFO) - logger.info("running %s", ' '.join(sys.argv)) - - parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input", required=True, help="Input word2vec model") - parser.add_argument("-o", "--output", required=True, help="Output tensor file name prefix") + logging.basicConfig(format='%(asctime)s - %(module)s - %(levelname)s - %(message)s', level=logging.INFO) + parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__[:-138]) + parser.add_argument("-i", "--input", required=True, help="Path to input file in word2vec format") + parser.add_argument("-o", "--output", required=True, help="Prefix path for output files") parser.add_argument( - "-b", "--binary", required=False, help="If word2vec model in binary format, set True, else False" + "-b", "--binary", action='store_const', const=True, default=False, + help="Set this flag if word2vec model in binary format (default: %(default)s)" ) args = parser.parse_args() + logger.info("running %s", ' '.join(sys.argv)) word2vec2tensor(args.input, args.output, args.binary) - logger.info("finished running %s", os.path.basename(sys.argv[0])) diff --git a/setup.py b/setup.py index 30d6b3f554..c5fa9ba84a 100644 --- a/setup.py +++ b/setup.py @@ -307,7 +307,7 @@ def finalize_options(self): 'distributed': distributed_env, 'test-win': win_testenv, 'test': linux_testenv, - 'docs': linux_testenv + distributed_env + ['sphinx', 'sphinxcontrib-napoleon', 'plotly', 'pattern'], + 'docs': linux_testenv + distributed_env + ['sphinx', 'sphinxcontrib-napoleon', 'plotly', 'pattern', 'sphinxcontrib.programoutput'], }, 
include_package_data=True,