Refactor documentation API Reference for gensim.summarization (#1709)
* Added docstrings in textcleaner.py

* Added docstrings to bm25.py

* syntactic_unit.py docstrings and typo

* added docstrings for graph modules

* keywords draft

* keywords draft updated

* keywords draft updated again

* keywords edited

* pagerank started

* pagerank summarizer docstring added

* fixed types in docstrings in commons, bm25, graph and keywords

* fixed types and examples in docstrings

* fix pep8

* fix doc build

* fix bm25

* fix graph

* fix graph[2]

* fix commons

* fix keywords

* fix keywords[2]

* fix mz_entropy

* fix pagerank_weighted

* fix graph rst

* fix summarizer

* fix syntactic_unit

* fix textcleaner

* fix
yurkai authored and menshikh-iv committed Dec 12, 2017
1 parent 056ec00 commit f09b7db
Showing 13 changed files with 1,435 additions and 157 deletions.
1 change: 1 addition & 0 deletions docs/src/apiref.rst
@@ -88,6 +88,7 @@ Modules:
summarization/commons
summarization/graph
summarization/keywords
summarization/mz_entropy
summarization/pagerank_weighted
summarization/summariser
summarization/syntactic_unit
6 changes: 3 additions & 3 deletions docs/src/summarization/graph.rst
@@ -1,8 +1,8 @@
:mod:`summarization.graph` -- TextRank graph
=========================================================
:mod:`summarization.graph` -- Graph
===================================

.. automodule:: gensim.summarization.graph
:synopsis: TextRank graph
:synopsis: Graph
:members:
:inherited-members:
:undoc-members:
9 changes: 9 additions & 0 deletions docs/src/summarization/mz_entropy.rst
@@ -0,0 +1,9 @@
:mod:`summarization.mz_entropy` -- Keywords for the Montemurro and Zanette entropy algorithm
============================================================================================

.. automodule:: gensim.summarization.mz_entropy
:synopsis: Keywords for the Montemurro and Zanette entropy algorithm
:members:
:inherited-members:
:undoc-members:
:show-inheritance:
2 changes: 1 addition & 1 deletion gensim/parsing/preprocessing.py
@@ -363,7 +363,7 @@ def preprocess_documents(docs):
Returns
-------
list of (list of str)
list of list of str
Processed documents split by whitespace.
Examples
115 changes: 114 additions & 1 deletion gensim/summarization/bm25.py
@@ -3,20 +3,75 @@
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module contains function of computing rank scores for documents in
corpus and helper class `BM25` used in calculations. Original alhorithm
descibed in [1]_, also you may check Wikipedia page [2]_.
.. [1] Robertson, Stephen; Zaragoza, Hugo (2009). The Probabilistic Relevance Framework: BM25 and Beyond,
http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf
.. [2] Okapi BM25 on Wikipedia, https://en.wikipedia.org/wiki/Okapi_BM25
Examples
--------
>>> from gensim.summarization.bm25 import get_bm25_weights
>>> corpus = [
... ["black", "cat", "white", "cat"],
... ["cat", "outer", "space"],
... ["wag", "dog"]
... ]
>>> result = get_bm25_weights(corpus)
Data:
-----
.. data:: PARAM_K1 - Free smoothing parameter for BM25.
.. data:: PARAM_B - Free smoothing parameter for BM25.
.. data:: EPSILON - Constant used for negative idf of document in corpus.
"""


import math
from six import iteritems
from six.moves import xrange


# BM25 parameters.
PARAM_K1 = 1.5
PARAM_B = 0.75
EPSILON = 0.25


class BM25(object):
"""Implementation of Best Matching 25 ranking function.
Attributes
----------
corpus_size : int
Size of corpus (number of documents).
avgdl : float
Average length of document in `corpus`.
corpus : list of list of str
Corpus of documents.
f : list of dict of int
Term frequencies for each document in `corpus`. Words are used as keys and frequencies as values.
df : dict
Document frequencies for the whole `corpus`. Words are used as keys and the number of documents containing them as values.
idf : dict
Inverse document frequencies for the whole `corpus`. Words are used as keys and idf values as values.
"""

def __init__(self, corpus):
"""
Parameters
----------
corpus : list of list of str
Given corpus.
"""
self.corpus_size = len(corpus)
self.avgdl = sum(float(len(x)) for x in corpus) / self.corpus_size
self.corpus = corpus
@@ -26,6 +81,7 @@ def __init__(self, corpus):
self.initialize()

def initialize(self):
"""Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies."""
for document in self.corpus:
frequencies = {}
for word in document:
@@ -43,6 +99,23 @@ def initialize(self):
self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

def get_score(self, document, index, average_idf):
"""Computes BM25 score of given `document` in relation to item of corpus selected by `index`.
Parameters
----------
document : list of str
Document to be scored.
index : int
Index of document in corpus selected to score with `document`.
average_idf : float
Average idf in corpus.
Returns
-------
float
BM25 score.
"""
score = 0
for word in document:
if word not in self.f[index]:
@@ -53,6 +126,22 @@ def get_score(self, document, index, average_idf):
return score

def get_scores(self, document, average_idf):
"""Computes and returns BM25 scores of given `document` in relation to
every item in corpus.
Parameters
----------
document : list of str
Document to be scored.
average_idf : float
Average idf in corpus.
Returns
-------
list of float
BM25 scores.
"""
scores = []
for index in xrange(self.corpus_size):
score = self.get_score(document, index, average_idf)
@@ -61,6 +150,30 @@ def get_scores(self, document, average_idf):


def get_bm25_weights(corpus):
"""Returns BM25 scores (weights) of documents in corpus.
Each document has to be weighted with every document in given corpus.
Parameters
----------
corpus : list of list of str
Corpus of documents.
Returns
-------
list of list of float
BM25 scores.
Examples
--------
>>> from gensim.summarization.bm25 import get_bm25_weights
>>> corpus = [
... ["black", "cat", "white", "cat"],
... ["cat", "outer", "space"],
... ["wag", "dog"]
... ]
>>> result = get_bm25_weights(corpus)
"""
bm25 = BM25(corpus)
average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)

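Taken together, the new docstrings above describe a small query workflow for the `BM25` class. The sketch below is illustrative only (the corpus and query are made up), but it sticks to the constructor, `get_scores` and `get_bm25_weights` signatures documented in this patch.

from gensim.summarization.bm25 import BM25, get_bm25_weights

corpus = [
    ["black", "cat", "white", "cat"],
    ["cat", "outer", "space"],
    ["wag", "dog"],
]

bm25 = BM25(corpus)
# Average idf over the vocabulary, mirroring what get_bm25_weights() does internally.
average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)

# Score an (illustrative) query document against every document in the corpus.
scores = bm25.get_scores(["cat", "space"], average_idf)

# Or compute the full document-by-document weight matrix in one call.
weights = get_bm25_weights(corpus)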
44 changes: 44 additions & 0 deletions gensim/summarization/commons.py
@@ -3,10 +3,45 @@
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module provides functions of creating graph from sequence of values and removing of unreachable nodes.
Examples
--------
Create a simple graph and add edges, then take a look at the nodes.
>>> gg = build_graph(['Felidae', 'Lion', 'Tiger', 'Wolf'])
>>> gg.add_edge(("Felidae", "Lion"))
>>> gg.add_edge(("Felidae", "Tiger"))
>>> sorted(gg.nodes())
['Felidae', 'Lion', 'Tiger', 'Wolf']
Remove nodes with no edges.
>>> remove_unreachable_nodes(gg)
>>> sorted(gg.nodes())
['Felidae', 'Lion', 'Tiger']
"""

from gensim.summarization.graph import Graph


def build_graph(sequence):
"""Creates and returns undirected graph with given sequence of values.
Parameters
----------
sequence : list of hashable
Sequence of values.
Returns
-------
:class:`~gensim.summarization.graph.Graph`
Created graph.
"""
graph = Graph()
for item in sequence:
if not graph.has_node(item):
@@ -15,6 +50,15 @@ def build_graph(sequence):


def remove_unreachable_nodes(graph):
"""Removes unreachable nodes (nodes with no edges), inplace.
Parameters
----------
graph : :class:`~gensim.summarization.graph.Graph`
Given graph.
"""

for node in graph.nodes():
if sum(graph.edge_weight((node, other)) for other in graph.neighbors(node)) == 0:
graph.del_node(node)
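The doctest in the new commons module docstring omits its imports; a self-contained version of the same example (assuming the module path gensim.summarization.commons shown in this diff) would look like:

from gensim.summarization.commons import build_graph, remove_unreachable_nodes

# Build an undirected graph whose nodes are the given values, then connect some of them.
gg = build_graph(['Felidae', 'Lion', 'Tiger', 'Wolf'])
gg.add_edge(("Felidae", "Lion"))
gg.add_edge(("Felidae", "Tiger"))
print(sorted(gg.nodes()))  # ['Felidae', 'Lion', 'Tiger', 'Wolf']

# 'Wolf' has no edges, so it is dropped in place.
remove_unreachable_nodes(gg)
print(sorted(gg.nodes()))  # ['Felidae', 'Lion', 'Tiger']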