Refactor documentation for gensim.similarities.docsim and `MmCorpus…

…-related`. (#1910) * Added example for text_corpus.py * Fix for example * Updated docstrings for docsim.py * Beta_docstrings for docsim.py * Gamma_docstrings for docsim.py * Massive package of different files. * fix build (PEP8, rst) * retranslate _mmreader.pyx with cython==0.27.3 * fix matutils * fix textcorpus * fix mmcorpus * fix mmreader[2] * fix docsim[1] * fix docsim[2] * fix docsim[3] * fix docsim[4] * fix docsim[5]
piskvorky · Feb 23, 2018 · 5355c06 · 5355c06
1 parent c3f08c1
commit 5355c06
Show file tree

Hide file tree

Showing 8 changed files with 1,211 additions and 809 deletions.
diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst
@@ -20,6 +20,7 @@ Modules:
     corpora/lowcorpus
     corpora/malletcorpus
     corpora/mmcorpus
+    corpora/_mmreader
     corpora/sharded_corpus
     corpora/svmlightcorpus
     corpora/textcorpus

diff --git a/docs/src/corpora/_mmreader.rst b/docs/src/corpora/_mmreader.rst
@@ -0,0 +1,9 @@
+:mod:`corpora._mmreader` -- Reader for corpus in the Matrix Market format.
+==========================================================================
+
+.. automodule:: gensim.corpora._mmreader
+    :synopsis: Reader for corpus in the Matrix Market format.
+    :members:
+    :inherited-members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/gensim/corpora/_mmreader.c b/gensim/corpora/_mmreader.c
diff --git a/gensim/corpora/_mmreader.pyx b/gensim/corpora/_mmreader.pyx
@@ -1,8 +1,6 @@
 # Copyright (C) 2018 Radim Rehurek <radimrehurek@seznam.cz>
-"""
-Reader for corpus in the Matrix Market format.
-
-"""
+# cython: embedsignature=True
+"""Reader for corpus in the Matrix Market format."""
 from __future__ import with_statement
 
 from gensim import utils
@@ -19,20 +17,19 @@ logger = logging.getLogger(__name__)
 
 
 cdef class MmReader(object):
-    """
-    matrix market file reader
+    """Matrix market file reader (fast Cython version), used for :class:`~gensim.corpora.mmcorpus.MmCorpus`.
 
     Wrap a term-document matrix on disk (in matrix-market format), and present it
     as an object which supports iteration over the rows (~documents).
 
     Attributes
     ----------
     num_docs : int
-        number of documents in market matrix file
+        Number of documents in market matrix file.
     num_terms : int
-        number of terms
+        Number of terms.
     num_nnz : int
-        number of non-zero terms
+        Number of non-zero terms.
 
     Notes
     ----------
@@ -47,20 +44,15 @@ cdef class MmReader(object):
 
     def __init__(self, input, transposed=True):
         """
-        MmReader(input, transposed=True):
-
-        Create matrix reader
 
         Parameters
         ----------
-        input : string or file-like
-            string (file path) or a file-like object that supports
-            `seek()` (e.g. gzip.GzipFile, bz2.BZ2File). File-like objects are
-            not closed automatically.
+        input : {str, file-like object}
+            Path to input file in MM format or a file-like object that supports `seek()`
+            (e.g. :class:`~gzip.GzipFile`, :class:`~bz2.BZ2File`).
 
-        transposed : bool
-            if True, expects lines to represent doc_id, term_id, value
-            else, expects term_id, doc_id, value
+        transposed : bool, optional
+            If True, expects lines to represent doc_id, term_id, value. Else, expects term_id, doc_id, value.
 
         """
         logger.info("initializing cython corpus reader from %s", input)
@@ -91,22 +83,20 @@ cdef class MmReader(object):
         )
 
     def __len__(self):
+        """Get size of corpus (number of documents)."""
         return self.num_docs
 
     def __str__(self):
         return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
                 (self.num_docs, self.num_terms, self.num_nnz))
 
     def skip_headers(self, input_file):
-        """
-        skip_headers(self, input_file)
-
-        Skip file headers that appear before the first document.
+        """Skip file headers that appear before the first document.
 
         Parameters
         ----------
-        input_file : iterable
-            consumes any lines from start of `input_file` that begin with a %
+        input_file : iterable of str
+            Iterable taken from file in MM format.
 
         """
         for line in input_file:
@@ -115,23 +105,18 @@ cdef class MmReader(object):
             break
 
     def __iter__(self):
-        """
-        __iter__()
-
-        Iterate through vectors from underlying matrix
+        """Iterate through corpus.
 
-        Yields
+        Notes
         ------
-        int, list of (termid, val)
-            document id and "vector" of terms for next document in matrix
-            vector of terms is represented as a list of (termid, val) tuples
+        Note that the total number of vectors returned is always equal to the number of rows specified
+        in the header; empty documents are inserted and yielded where appropriate, even if they are not explicitly
+        stored in the Matrix Market file.
 
-        Notes
+        Yields
         ------
-        Note that the total number of vectors returned is always equal to the
-        number of rows specified in the header; empty documents are inserted and
-        yielded where appropriate, even if they are not explicitly stored in the
-        Matrix Market file.
+        (int, list of (int, number))
+            Document id and document in BoW format.
 
         """
         cdef int docid, termid, previd
@@ -180,21 +165,17 @@ cdef class MmReader(object):
             yield previd, []
 
     def docbyoffset(self, offset):
-        """
-        docbyoffset(offset)
-
-        Return document at file offset `offset` (in bytes)
+        """Get document at file offset `offset` (in bytes).
 
         Parameters
         ----------
         offset : int
-            offset, in bytes, of desired document
+            Offset, in bytes, of desired document.
 
         Returns
         ------
-        list of (termid, val)
-            "vector" of terms for document at offset
-            vector of terms is represented as a list of (termid, val) tuples
+        list of (int, number)
+            Document in BoW format.
 
         """
         # empty documents are not stored explicitly in MM format, so the index marks

diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py
@@ -5,10 +5,7 @@
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
 
-"""
-Corpus in the Matrix Market format.
-"""
-
+"""Corpus in the Matrix Market format."""
 
 import logging
 
@@ -20,39 +17,45 @@
 
 
 class MmCorpus(matutils.MmReader, IndexedCorpus):
-    """
-    Corpus in matrix market format
+    """Corpus in matrix market format.
 
     Wrap a term-document matrix on disk (in matrix-market format), and present it
     as an object which supports iteration over the rows (~documents).
 
     Attributes
     ----------
     num_docs : int
-        number of documents in market matrix file
+        Number of documents in market matrix file.
     num_terms : int
-        number of terms
+        Number of terms.
     num_nnz : int
-        number of non-zero terms
+        Number of non-zero terms.
 
     Notes
     ----------
-    Note that the file is read into memory one document at a time, not the whole
-    matrix at once (unlike scipy.io.mmread). This allows us to process corpora
-    which are larger than the available RAM.
+    Note that the file is read into memory one document at a time, not the whole matrix at once
+    (unlike :func:`~scipy.io.mmread`). This allows us to process corpora which are larger than the available RAM.
+
+    Examples
+    --------
+    >>> from gensim.corpora.mmcorpus import MmCorpus
+    >>> from gensim.test.utils import datapath
+    >>> import gensim.downloader as api
+    >>>
+    >>> corpus = MmCorpus(datapath('test_mmcorpus_with_index.mm'))
+    >>> for document in corpus:
+    ...     pass
 
     """
 
     def __init__(self, fname):
         """
-        Read corpus in matrix market format
 
         Parameters
         ----------
-        fname : string or file-like
-            string (file path) or a file-like object that supports
-            `seek()` (e.g. gzip.GzipFile, bz2.BZ2File). File-like objects are
-            not closed automatically.
+        fname : {str, file-like object}
+            Path to file in MM format or a file-like object that supports `seek()`
+            (e.g. :class:`gzip.GzipFile`, :class:`bz2.BZ2File`).
 
         """
 
@@ -61,34 +64,56 @@ def __init__(self, fname):
         matutils.MmReader.__init__(self, fname)
 
     def __iter__(self):
-        """
-        Iterate through vectors from underlying matrix
+        """Iterate through all documents in the corpus.
 
         Yields
         ------
-        list of (termid, val)
-            "vector" of terms for next document in matrix
-            vector of terms is represented as a list of (termid, val) tuples
+        list of (int, number)
+            Document in BoW format.
 
         Notes
         ------
-        Note that the total number of vectors returned is always equal to the
-        number of rows specified in the header; empty documents are inserted and
-        yielded where appropriate, even if they are not explicitly stored in the
+        The total number of vectors returned is always equal to the number of rows specified in the header.
+        Empty documents are inserted and yielded where appropriate, even if they are not explicitly stored in the
         Matrix Market file.
 
         """
-
         for doc_id, doc in super(MmCorpus, self).__iter__():
             yield doc  # get rid of doc id, return the sparse vector only
 
     @staticmethod
     def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False):
-        """
-        Save a corpus in the Matrix Market format to disk.
+        """Save a corpus in the Matrix Market format to disk.
 
+        Parameters
+        ----------
+        fname : str
+            Path to file.
+        corpus : iterable of list of (int, number)
+            Corpus in BoW format.
+        id2word : dict of (int, str), optional
+            WordId -> Word.
+        progress_cnt : int, optional
+            Progress counter.
+        metadata : bool, optional
+            If True, writes out additional metadata.
+
+        Notes
+        -----
         This function is automatically called by `MmCorpus.serialize`; don't
         call it directly, call `serialize` instead.
+
+        Examples
+        --------
+        >>> from gensim.corpora.mmcorpus import MmCorpus
+        >>> from gensim.test.utils import datapath
+        >>> import gensim.downloader as api
+        >>>
+        >>> corpus = MmCorpus(datapath('test_mmcorpus_with_index.mm'))
+        >>>
+        >>> MmCorpus.save_corpus("random", corpus) # Do not do it, use `serialize` instead.
+        [97, 121, 169, 201, 225, 249, 258, 276, 303]
+
         """
         logger.info("storing corpus in Matrix Market format to %s", fname)
         num_terms = len(id2word) if id2word is not None else None

diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py
@@ -216,13 +216,26 @@ def __init__(self, input=None, dictionary=None, metadata=False, character_filter
 
         Examples
         --------
-        >>> #TODO Example with inheritance
         >>> from gensim.corpora.textcorpus import TextCorpus
         >>> from gensim.test.utils import datapath
+        >>> from gensim import utils
         >>>
-        >>> corpus = TextCorpus(datapath('head500.noblanks.cor.bz2'))
-        >>> for bow in corpus:
-        ...     pass
+        >>>
+        >>> class CorpusMiislita(TextCorpus):
+        ...     stopwords = set('for a of the and to in on'.split())
+        ...
+        ...     def get_texts(self):
+        ...         for doc in self.getstream():
+        ...             yield [word for word in utils.to_unicode(doc).lower().split() if word not in self.stopwords]
+        ...
+        ...     def __len__(self):
+        ...         self.length = sum(1 for _ in self.get_texts())
+        ...         return self.length
+        >>>
+        >>> corpus = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))
+        >>> len(corpus)
+        250
+        >>> document = next(iter(corpus.get_texts()))
 
         """
         self.input = input