Refactor documentation API Reference for gensim.summarization (#1709)
* Added docstrings in textcleaner.py

* Added docstrings to bm25.py

* syntactic_unit.py docstrings and typo

* added docstrings for graph modules

* keywords draft

* keywords draft updated

* keywords draft updated again

* keywords edited

* pagerank started

* pagerank summarizer docstring added

* fixed types in docstrings in commons, bm25, graph and keywords

* fixed types and examples in docstrings

* fix pep8

* fix doc build

* fix bm25

* fix graph

* fix graph[2]

* fix commons

* fix keywords

* fix keywords[2]

* fix mz_entropy

* fix pagerank_weighted

* fix graph rst

* fix summarizer

* fix syntactic_unit

* fix textcleaner

* fix
yurkai authored and menshikh-iv committed Dec 12, 2017
1 parent 056ec00 commit f09b7db
Showing 13 changed files with 1,435 additions and 157 deletions.
1 change: 1 addition & 0 deletions docs/src/apiref.rst
@@ -88,6 +88,7 @@ Modules:
summarization/commons
summarization/graph
summarization/keywords
summarization/mz_entropy
summarization/pagerank_weighted
summarization/summariser
summarization/syntactic_unit
6 changes: 3 additions & 3 deletions docs/src/summarization/graph.rst
@@ -1,8 +1,8 @@
:mod:`summarization.graph` -- TextRank graph
=========================================================
:mod:`summarization.graph` -- Graph
===================================

.. automodule:: gensim.summarization.graph
:synopsis: TextRank graph
:synopsis: Graph
:members:
:inherited-members:
:undoc-members:
9 changes: 9 additions & 0 deletions docs/src/summarization/mz_entropy.rst
@@ -0,0 +1,9 @@
:mod:`summarization.mz_entropy` -- Keywords for the Montemurro and Zanette entropy algorithm
============================================================================================

.. automodule:: gensim.summarization.mz_entropy
:synopsis: Keywords for the Montemurro and Zanette entropy algorithm
:members:
:inherited-members:
:undoc-members:
:show-inheritance:
2 changes: 1 addition & 1 deletion gensim/parsing/preprocessing.py
@@ -363,7 +363,7 @@ def preprocess_documents(docs):
Returns
-------
list of (list of str)
list of list of str
Processed documents split by whitespace.
Examples
115 changes: 114 additions & 1 deletion gensim/summarization/bm25.py
@@ -3,20 +3,75 @@
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module contains function of computing rank scores for documents in
corpus and helper class `BM25` used in calculations. Original alhorithm
descibed in [1]_, also you may check Wikipedia page [2]_.
.. [1] Robertson, Stephen; Zaragoza, Hugo (2009). The Probabilistic Relevance Framework: BM25 and Beyond,
http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf
.. [2] Okapi BM25 on Wikipedia, https://en.wikipedia.org/wiki/Okapi_BM25
Examples
--------
>>> from gensim.summarization.bm25 import get_bm25_weights
>>> corpus = [
... ["black", "cat", "white", "cat"],
... ["cat", "outer", "space"],
... ["wag", "dog"]
... ]
>>> result = get_bm25_weights(corpus)
Data:
-----
.. data:: PARAM_K1 - Free smoothing parameter for BM25.
.. data:: PARAM_B - Free smoothing parameter for BM25.
.. data:: EPSILON - Constant used for negative idf of document in corpus.
"""


import math
from six import iteritems
from six.moves import xrange


# BM25 parameters.
PARAM_K1 = 1.5
PARAM_B = 0.75
EPSILON = 0.25


class BM25(object):
"""Implementation of Best Matching 25 ranking function.
Attributes
----------
corpus_size : int
Size of corpus (number of documents).
avgdl : float
Average length of document in `corpus`.
corpus : list of list of str
Corpus of documents.
f : list of dict of int
Term frequencies for each document in `corpus`. Words are used as keys and frequencies as values.
df : dict
Document frequencies for the whole `corpus`. Words are used as keys and the number of documents containing them as values.
idf : dict
Inverse document frequencies for the whole `corpus`. Words are used as keys and idf values as values.
"""

def __init__(self, corpus):
"""
Parameters
----------
corpus : list of list of str
Given corpus.
"""
self.corpus_size = len(corpus)
self.avgdl = sum(float(len(x)) for x in corpus) / self.corpus_size
self.corpus = corpus
@@ -26,6 +81,7 @@ def __init__(self, corpus):
self.initialize()

def initialize(self):
"""Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies."""
for document in self.corpus:
frequencies = {}
for word in document:
@@ -43,6 +99,23 @@ def initialize(self):
self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

def get_score(self, document, index, average_idf):
"""Computes BM25 score of given `document` in relation to item of corpus selected by `index`.
Parameters
----------
document : list of str
Document to be scored.
index : int
Index of document in corpus selected to score with `document`.
average_idf : float
Average idf in corpus.
Returns
-------
float
BM25 score.
"""
score = 0
for word in document:
if word not in self.f[index]:
@@ -53,6 +126,22 @@ def get_score(self, document, index, average_idf):
return score

def get_scores(self, document, average_idf):
"""Computes and returns BM25 scores of given `document` in relation to
every item in corpus.
Parameters
----------
document : list of str
Document to be scored.
average_idf : float
Average idf in corpus.
Returns
-------
list of float
BM25 scores.
"""
scores = []
for index in xrange(self.corpus_size):
score = self.get_score(document, index, average_idf)
@@ -61,6 +150,30 @@ def get_scores(self, document, average_idf):


def get_bm25_weights(corpus):
"""Returns BM25 scores (weights) of documents in corpus.
Each document has to be weighted with every document in given corpus.
Parameters
----------
corpus : list of list of str
Corpus of documents.
Returns
-------
list of list of float
BM25 scores.
Examples
--------
>>> from gensim.summarization.bm25 import get_bm25_weights
>>> corpus = [
... ["black", "cat", "white", "cat"],
... ["cat", "outer", "space"],
... ["wag", "dog"]
... ]
>>> result = get_bm25_weights(corpus)
"""
bm25 = BM25(corpus)
average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)

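Taken together, the new docstrings above describe a small query workflow for the `BM25` class. The sketch below is illustrative only (the corpus and query are made up), but it sticks to the constructor, `get_scores` and `get_bm25_weights` signatures documented in this patch.

from gensim.summarization.bm25 import BM25, get_bm25_weights

corpus = [
    ["black", "cat", "white", "cat"],
    ["cat", "outer", "space"],
    ["wag", "dog"],
]

bm25 = BM25(corpus)
# Average idf over the vocabulary, mirroring what get_bm25_weights() does internally.
average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)

# Score an (illustrative) query document against every document in the corpus.
scores = bm25.get_scores(["cat", "space"], average_idf)

# Or compute the full document-by-document weight matrix in one call.
weights = get_bm25_weights(corpus)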
44 changes: 44 additions & 0 deletions gensim/summarization/commons.py
@@ -3,10 +3,45 @@
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module provides functions of creating graph from sequence of values and removing of unreachable nodes.
Examples
--------
Create a simple graph and add edges, then take a look at the nodes.
>>> gg = build_graph(['Felidae', 'Lion', 'Tiger', 'Wolf'])
>>> gg.add_edge(("Felidae", "Lion"))
>>> gg.add_edge(("Felidae", "Tiger"))
>>> sorted(gg.nodes())
['Felidae', 'Lion', 'Tiger', 'Wolf']
Remove nodes with no edges.
>>> remove_unreachable_nodes(gg)
>>> sorted(gg.nodes())
['Felidae', 'Lion', 'Tiger']
"""

from gensim.summarization.graph import Graph


def build_graph(sequence):
"""Creates and returns undirected graph with given sequence of values.
Parameters
----------
sequence : list of hashable
Sequence of values.
Returns
-------
:class:`~gensim.summarization.graph.Graph`
Created graph.
"""
graph = Graph()
for item in sequence:
if not graph.has_node(item):
@@ -15,6 +50,15 @@ def build_graph(sequence):


def remove_unreachable_nodes(graph):
"""Removes unreachable nodes (nodes with no edges), inplace.
Parameters
----------
graph : :class:`~gensim.summarization.graph.Graph`
Given graph.
"""

for node in graph.nodes():
if sum(graph.edge_weight((node, other)) for other in graph.neighbors(node)) == 0:
graph.del_node(node)
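The doctest in the new commons module docstring omits its imports; a self-contained version of the same example (assuming the module path gensim.summarization.commons shown in this diff) would look like:

from gensim.summarization.commons import build_graph, remove_unreachable_nodes

# Build an undirected graph whose nodes are the given values, then connect some of them.
gg = build_graph(['Felidae', 'Lion', 'Tiger', 'Wolf'])
gg.add_edge(("Felidae", "Lion"))
gg.add_edge(("Felidae", "Tiger"))
print(sorted(gg.nodes()))  # ['Felidae', 'Lion', 'Tiger', 'Wolf']

# 'Wolf' has no edges, so it is dropped in place.
remove_unreachable_nodes(gg)
print(sorted(gg.nodes()))  # ['Felidae', 'Lion', 'Tiger']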