From 1c6009cb077dce8fa806f2397f6291980bcff5af Mon Sep 17 00:00:00 2001
From: Yuri Isakov
Date: Sun, 12 Nov 2017 15:49:36 +0300
Subject: [PATCH 01/27] Added docstrings in textcleaner.py

---
 gensim/summarization/textcleaner.py | 179 +++++++++++++++++++++++++++-
 1 file changed, 175 insertions(+), 4 deletions(-)

diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py
index 5f33bbcea9..5c4bc363ef 100644
--- a/gensim/summarization/textcleaner.py
+++ b/gensim/summarization/textcleaner.py
@@ -3,6 +3,13 @@
 #
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
+"""Text Cleaner
+
+This module contains functions and processors used for processing text,
+extracting sentences from text, working with acronyms and abbreviations.
+"""
+
+
 from gensim.summarization.syntactic_unit import SyntacticUnit
 from gensim.parsing.preprocessing import preprocess_documents
 from gensim.utils import tokenize
@@ -22,28 +29,102 @@
 
 SEPARATOR = r'@'
+"""str: special separator used in abbreviations."""
 RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)  # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)
+"""SRE_Pattern: pattern to split text into sentences."""
 AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
+"""SRE_Pattern: pattern for detecting abbreviations (example: Sgt. Pepper)."""
 AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)
+"""SRE_Pattern: one more pattern for detecting acronyms."""
 AB_ACRONYM_LETTERS = re.compile(r'([a-zA-Z])\.([a-zA-Z])\.', re.UNICODE)
+"""SRE_Pattern: one more pattern for detecting acronyms
+(example: P.S. I love you)."""
 UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + SEPARATOR + r'(\w)', re.UNICODE)
+"""SRE_Pattern: pattern like AB_SENIOR but with SEPARATOR between abbreviation
+and next word."""
 UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + SEPARATOR + r'(\w)', re.UNICODE)
+"""SRE_Pattern: pattern like AB_ACRONYM but with SEPARATOR between abbreviation
+and next word."""
 
 
 def split_sentences(text):
+    """Splits and returns a list of sentences from given text, preserving
+    abbreviations set in `AB_SENIOR` and `AB_ACRONYM`.
+
+    Parameters
+    ----------
+    text : str
+        Input text.
+
+    Returns
+    -------
+    list of str
+        Sentences of the given text.
+    """
     processed = replace_abbreviations(text)
     return [undo_replacement(sentence) for sentence in get_sentences(processed)]
 
 
 def replace_abbreviations(text):
+    """Replaces blank space with `@` separator between abbreviation and next word.
+
+    Parameters
+    ----------
+    text : str
+        Input text.
+
+    Returns
+    -------
+    str
+        Text with changed separator.
+
+    Example
+    -------
+    >>> replace_abbreviations("God bless you, please, Mrs. Robinson")
+    God bless you, please, Mrs.@Robinson
+    """
     return replace_with_separator(text, SEPARATOR, [AB_SENIOR, AB_ACRONYM])
 
 
 def undo_replacement(sentence):
+    """Replaces `@` separator with blank space after each abbreviation.
+
+    Parameters
+    ----------
+    sentence : str
+        Input sentence.
+
+    Returns
+    -------
+    str
+        Sentence with changed separator.
+
+    Example
+    -------
+    >>> undo_replacement("God bless you, please, Mrs.@Robinson")
+    God bless you, please, Mrs. Robinson
+    """
     return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM])
 
 
 def replace_with_separator(text, separator, regexs):
+    """Returns text with replaced separator if provided regular expressions
+    were matched.
+
+    Parameters
+    ----------
+    text : str
+        Input text.
+    separator : str
+        The separator between words to be replaced.
+    regexs : list
+        List of regular expressions.
+
+    Returns
+    -------
+    str
+        Text with replaced separators.
+
+    """
     replacement = r"\1" + separator + r"\2"
     result = text
     for regex in regexs:
@@ -52,11 +133,49 @@
 
 
 def get_sentences(text):
+    """Sentence generator from provided text. Sentence pattern set in `RE_SENTENCE`.
+
+    Parameters
+    ----------
+    text : str
+        Input text.
+
+    Yields
+    ------
+    str
+        Single sentence extracted from text.
+
+    Example
+    -------
+    >>> text = "Does this text contains two sentences? Yes, it is."
+    >>> for sentence in get_sentences(text):
+    >>>     print(sentence)
+    Does this text contains two sentences?
+    Yes, it is.
+
+    """
     for match in RE_SENTENCE.finditer(text):
         yield match.group()
 
 
 def merge_syntactic_units(original_units, filtered_units, tags=None):
+    """Processes given sentences and their filtered (tokenized) copies into
+    SyntacticUnit type. Also adds tags to produced units if they are provided.
+    Returns a SyntacticUnit list.
+
+    Parameters
+    ----------
+    original_units : list
+        List of original sintences.
+    filtered_units : list
+        List of tokenized sintences.
+    tags : list
+        List of strings used as tags for each unit. None as deafault.
+
+    Returns
+    -------
+    list
+        SyntacticUnit for each input item.
+
+    """
     units = []
     for i in xrange(len(original_units)):
         if filtered_units[i] == '':
@@ -74,12 +193,37 @@
 
 
 def join_words(words, separator=" "):
+    """Merges words to a string using separator (blank space as default).
+
+    Parameters
+    ----------
+    words : list
+        List of words.
+    separator : str
+        The separator between elements. Blank set as default.
+
+    Returns
+    -------
+    str
+        String of merged words with separator between them.
+
+    """
     return separator.join(words)
 
 
 def clean_text_by_sentences(text):
-    """ Tokenizes a given text into sentences, applying filters and lemmatizing them.
-    Returns a SyntacticUnit list. """
+    """Tokenizes a given text into sentences, applying filters and lemmatizing them.
+    Returns a SyntacticUnit list.
+
+    Parameters
+    ----------
+    text : str
+        Input text.
+
+    Returns
+    -------
+    list
+        SyntacticUnit objects for each sentence.
+
+    """
     original_sentences = split_sentences(text)
     filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)]
 
@@ -87,8 +231,21 @@
 
 
 def clean_text_by_word(text, deacc=True):
-    """ Tokenizes a given text into words, applying filters and lemmatizing them.
-    Returns a dict of word -> syntacticUnit. """
+    """Tokenizes a given text into words, applying filters and lemmatizing them.
+    Returns a dictionary of word -> syntacticUnit.
+
+    Parameters
+    ----------
+    text : str
+        Input text.
+    deacc : bool
+        Remove accentuation (default True).
+
+    Returns
+    -------
+    dict
+        Word as key, SyntacticUnit as value of dictionary.
+
+    """
     text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
     original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
     filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
@@ -101,5 +258,19 @@
 
 
 def tokenize_by_word(text):
+    """Tokenizes input text. Before tokenizing transforms text to lower case and
+    removes accentuation and acronyms set `AB_ACRONYM_LETTERS`.
+    Returns generator of words.
+
+    Parameters
+    ----------
+    text : str
+        Input text.
+ + Returns + ------- + generator + Words contained in processed text. + """ text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) return tokenize(text_without_acronyms, to_lower=True, deacc=True) From 5cbb1848112b66344b617259c6501dfa0653013b Mon Sep 17 00:00:00 2001 From: Yuri Isakov Date: Tue, 14 Nov 2017 01:38:35 +0300 Subject: [PATCH 02/27] Added docstrings to bm25.py --- gensim/summarization/bm25.py | 109 ++++++++++++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 1 deletion(-) diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index 1fb11a8d77..608acf7a6a 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -3,20 +3,72 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""This module contains function of computing BM25 scores for documents in +corpus and helper class `BM25` used in calculations. + + +Example: +-------- +>>> import numpy as np +>>> from gensim.summarization.bm25 import get_bm25_weights +>>> corpus = [ +>>> ["black", "cat", "white", "cat"], +>>> ["cat", "outer", "space"], +>>> ["wag", "dog"] +>>> ] +>>> np.round(get_bm25_weights(corpus), 3) +array([[ 1.282, 0.182, 0. ], + [ 0.13 , 1.113, 0. ], + [ 0. , 0. , 1.022]]) + +Data: +----- +.. data:: PARAM_K1 - free smoothing parameter for BM25. +.. data:: PARAM_B - free smoothing parameter for BM25. +.. data:: EPSILON - constant used for negative idf of document in corpus. +""" + + import math from six import iteritems from six.moves import xrange -# BM25 parameters. PARAM_K1 = 1.5 PARAM_B = 0.75 EPSILON = 0.25 class BM25(object): + """Implementation of Best Matching 25 ranking function. + + Attributes + ---------- + corpus_size : int + Size of corpus (number of documents). + avgdl : float + Average length of document in `corpus`. + corpus : list of (list of str) + Corpus of documents. + f : list of dict + Terms frequencies for each document in `corpus`. + df : dict + Terms frequencies for whole `corpus`. + idf : dict + Inverse document frequency. + + """ + def __init__(self, corpus): + """Presets atributes and runs initialize() function. + + Parameters + ---------- + corpus : list of (list of str) + Corpus of documents. + + """ self.corpus_size = len(corpus) self.avgdl = sum(float(len(x)) for x in corpus) / self.corpus_size self.corpus = corpus @@ -25,7 +77,12 @@ def __init__(self, corpus): self.idf = {} self.initialize() + def initialize(self): + """Calculates frequncies of terms in documents and in corpus. Also + computes inverse document frequncies. + + """ for document in self.corpus: frequencies = {} for word in document: @@ -42,7 +99,26 @@ def initialize(self): for word, freq in iteritems(self.df): self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5) + def get_score(self, document, index, average_idf): + """Computes BM25 score of given `document` in relation to item of corpus + selected by `index`. + + Parameters + ---------- + document : list of str + Document to be scored. + index : integer + Index of document in corpus selected to score with `document`. + average_idf : float + Average idf in corpus. + + Returns + ------- + float + BM25 score. 
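+
+        Example
+        -------
+        A minimal usage sketch; the two-document corpus and the scored
+        document below are illustrative:
+
+        >>> corpus = [["black", "cat", "white", "cat"], ["cat", "outer", "space"]]
+        >>> bm25 = BM25(corpus)
+        >>> average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
+        >>> score = bm25.get_score(["white", "cat"], 0, average_idf)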
+ + """ score = 0 for word in document: if word not in self.f[index]: @@ -52,7 +128,24 @@ def get_score(self, document, index, average_idf): / (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl))) return score + def get_scores(self, document, average_idf): + """Computes and returns BM25 scores of given `document` in relation to + every item in corpus. + + Parameters + ---------- + document : list of str + Document to be scored. + average_idf : float + Average idf in corpus. + + Returns + ------- + list of float + BM25 scores. + + """ scores = [] for index in xrange(self.corpus_size): score = self.get_score(document, index, average_idf) @@ -61,6 +154,20 @@ def get_scores(self, document, average_idf): def get_bm25_weights(corpus): + """Returns BM25 scores (weights) of documents in corpus. Each document + has to be weighted with every document in given corpus. + + Parameters + ---------- + corpus : list of (list of str) + Corpus of documents. + + Returns + ------- + list of (list of float) + BM25 scores. + + """ bm25 = BM25(corpus) average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf) From 31be0959f56a5dfc155fa94d3338f71720d25c87 Mon Sep 17 00:00:00 2001 From: Yuri Isakov Date: Tue, 14 Nov 2017 11:42:37 +0300 Subject: [PATCH 03/27] syntactic_unit.py docstrings and typo --- gensim/summarization/syntactic_unit.py | 39 ++++++++++++++++++++++++++ gensim/summarization/textcleaner.py | 4 +-- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/gensim/summarization/syntactic_unit.py b/gensim/summarization/syntactic_unit.py index 89842e1122..492f141ad6 100644 --- a/gensim/summarization/syntactic_unit.py +++ b/gensim/summarization/syntactic_unit.py @@ -3,10 +3,49 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""This module contains implementation of SyntacticUnit class. It generally used +while text cleaning. SyntacticUnit represents printable version of provided +text. + + +Example +------- +>>> print(SyntacticUnit("Beautiful is better than ugly.", "beauti better ugli")) +Original unit: 'Beautiful is better than ugly.' *-*-*-* Processed unit: 'beauti better ugli' + +""" class SyntacticUnit(object): + """SyntacticUnit class. + + Attributes + ---------- + text : str + Input text. + token : str + Tokenized text. + tag : str + Tag of unit, optional. + index : int + Index of sytactic unit in corpus, optional. + score : float + Score (BM25) of synctatic unit, optional. + + """ def __init__(self, text, token=None, tag=None): + """Initializates syntactic unit. + + Parameters + ---------- + text : str + Input text. + token : str + Tokenized text, optional. + tag : str + Tag of unit, optional. + + """ self.text = text self.token = token self.tag = tag[:2] if tag else None # Just first two letters of tag diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py index 5c4bc363ef..7ae7b6bc30 100644 --- a/gensim/summarization/textcleaner.py +++ b/gensim/summarization/textcleaner.py @@ -165,9 +165,9 @@ def merge_syntactic_units(original_units, filtered_units, tags=None): Parameters ---------- original_units : list - List of original sintences. + List of original sentences. filtered_units : list - List of tokenized sintences. + List of tokenized sentences. tags : list List of strings used as tags for each unit. None as deafault. 
From c6c608b0d510d382a3d3aa09095a67ae91c4e86d Mon Sep 17 00:00:00 2001 From: Yuri Isakov Date: Thu, 16 Nov 2017 19:43:04 +0300 Subject: [PATCH 04/27] added doctrings for graph modules --- gensim/summarization/commons.py | 45 ++++ gensim/summarization/graph.py | 381 +++++++++++++++++++++++++++----- 2 files changed, 366 insertions(+), 60 deletions(-) diff --git a/gensim/summarization/commons.py b/gensim/summarization/commons.py index 1c467098f9..4e93461018 100644 --- a/gensim/summarization/commons.py +++ b/gensim/summarization/commons.py @@ -3,10 +3,46 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""This module provides functions of creatinf graph from sequence of values and +removing of unreachable nodes. + + +Examples +-------- + +Create simple graph and add edges. Let's kake a look at nodes. + +>>> gg = build_graph(['Felidae', 'Lion', 'Tiger', 'Wolf']) +>>> gg.add_edge(("Felidae", "Lion")) +>>> gg.add_edge(("Felidae", "Tiger")) +>>> gg.nodes() +['Felidae', 'Lion', 'Tiger', 'Wolf'] + +Remove nodes with no edges. + +>>> remove_unreachable_nodes(gg) +>>> gg.nodes() +['Felidae', 'Lion', 'Tiger'] + +""" + from gensim.summarization.graph import Graph def build_graph(sequence): + """Creates and returns graph with given sequence of values. + + Parameters + ---------- + sequence : list + Sequence of values. + + Returns + ------- + Graph + Created graph. + + """ graph = Graph() for item in sequence: if not graph.has_node(item): @@ -15,6 +51,15 @@ def build_graph(sequence): def remove_unreachable_nodes(graph): + """Removes unreachable nodes (nodes with no edges). Works inplace. + + Parameters + ---------- + graph : Graph + Given graph. + + """ + for node in graph.nodes(): if sum(graph.edge_weight((node, other)) for other in graph.neighbors(node)) == 0: graph.del_node(node) diff --git a/gensim/summarization/graph.py b/gensim/summarization/graph.py index c35a59a25d..a1b4ea0540 100644 --- a/gensim/summarization/graph.py +++ b/gensim/summarization/graph.py @@ -3,11 +3,41 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""This module contains abstract class IGraph represents graphs interface and +class Graph (based on IGraph) which implements undirected graph. + +Examples +-------- + +Create simple graph with 4 nodes. + +>>> g = Graph() +>>> g.add_node('Felidae') +>>> g.add_node('Lion') +>>> g.add_node('Tiger') +>>> g.add_node('Wolf') +>>> g.nodes() +['Felidae', 'Lion', 'Tiger', 'Wolf'] + +Add some edges. Let's check neighbours. + +>>> g.add_edge(("Felidae", "Lion")) +>>> g.add_edge(("Felidae", "Tiger")) +>>> g.neighbors("Felidae") +['Lion', 'Tiger'] + +One node has no neighbours. + +>>> g.neighbors("Wolf") +[] + +""" + from abc import ABCMeta, abstractmethod class IGraph(object): - """ Represents the interface or contract that the graph for TextRank + """Represents the interface or contract that the graph for TextRank should implement. """ __metaclass__ = ABCMeta @@ -15,20 +45,26 @@ class IGraph(object): @abstractmethod def nodes(self): """ - Return node list. + Returns all nodes of graph. + + Returns + ------- + list of node + Nodes of graph. - @rtype: list - @return: Node list. """ pass @abstractmethod def edges(self): """ - Return all edges in the graph. + Returns all edges of graph. + + Returns + ------- + list of (tuple of node) + Edges of graph. - @rtype: list - @return: List of all edges in the graph. """ pass @@ -37,107 +73,137 @@ def neighbors(self, node): """ Return all nodes that are directly accessible from given node. 
- @type node: node - @param node: Node identifier + Parameters + ---------- + node : str or float + Given node identifier. + + Returns + ------- + list of node + Nodes directly accessible from given `node`. - @rtype: list - @return: List of nodes directly accessible from given node. """ pass @abstractmethod def has_node(self, node): - """ - Return whether the requested node exists. + """Returns whether the requested node exists. + + Parameters + ---------- + node : str or float + Given node identifier. - @type node: node - @param node: Node identifier + Returns + ------- + bool + True if `node` exists, False otherwise. - @rtype: boolean - @return: Truth-value for node existence. """ pass @abstractmethod def add_node(self, node, attrs=None): - """ - Add given node to the graph. - - @attention: While nodes can be of any type, it's strongly recommended + """Adds given node to the graph. + + Note + ---- + While nodes can be of any type, it's strongly recommended to use only numbers and single-line strings as node identifiers if you intend to use write(). - @type node: node - @param node: Node identifier. + Parameters + ---------- + node : float or str + Given node + attrs : list + Node attributes specified as (attribute, value) - @type attrs: list - @param attrs: List of node attributes specified as (attribute, value) - tuples. """ pass @abstractmethod def add_edge(self, edge, wt=1, label='', attrs=None): - """ - Add an edge to the graph connecting two nodes. - - An edge, here, is a pair of nodes like C{(n, m)}. + """Adds an edge to the graph connecting two nodes. An edge, here, + is a tuple of two nodes. + + Parameters + ---------- + edge : tuple of node + Given edge. + wt : float + Weight of new edge. + label : str + Edge label. + attrs : list + Node attributes specified as (attribute, value) - @type edge: tuple - @param edge: Edge. - - @type wt: number - @param wt: Edge weight. - - @type label: string - @param label: Edge label. - - @type attrs: list - @param attrs: List of node attributes specified as (attribute, value) - tuples. """ pass @abstractmethod def has_edge(self, edge): - """ - Return whether an edge exists. + """Returns whether an edge exists. + + Parameters + ---------- + edge : tuple of node + Given edge. An edge, here, is a tuple of two nodes. - @type edge: tuple - @param edge: Edge. + Returns + ------- + bool + True if `edge` exists, False otherwise. - @rtype: boolean - @return: Truth-value for edge existence. """ pass @abstractmethod def edge_weight(self, edge): - """ - Get the weight of an edge. + """Returns weigth of given edge. + + Parameters + ---------- + edge : tuple of node + Given edge. - @type edge: edge - @param edge: One edge. + Returns + ------- + float + Edge weight. - @rtype: number - @return: Edge weight. """ pass @abstractmethod def del_node(self, node): - """ - Remove a node from the graph. + """Removes node and its edges from graph. - @type node: node - @param node: Node identifier. + Parameters + ---------- + node : float or str + Given node. + """ pass class Graph(IGraph): """ - Implementation of an undirected graph, based on Pygraph + Implementation of an undirected graph, based on IGraph. + + Attributes + ---------- + WEIGHT_ATTRIBUTE_NAME : str + Name of weight attribute in graph. + DEFAULT_WEIGHT : float + Weight set by default. + LABEL_ATTRIBUTE_NAME : str + Name of attribute. + DEFAULT_LABEL : str + Label set by default. 
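+
+    Example
+    -------
+    An illustrative sketch of a weighted edge (node names are arbitrary):
+
+    >>> g = Graph()
+    >>> g.add_node('a')
+    >>> g.add_node('b')
+    >>> g.add_edge(('a', 'b'), wt=3)
+    >>> g.edge_weight(('a', 'b'))
+    3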
+
     """
 
     WEIGHT_ATTRIBUTE_NAME = "weight"
@@ -160,19 +226,91 @@ def __init__(self):
         self.node_neighbors = {}
 
     def has_edge(self, edge):
+        """Returns whether an edge exists.
+
+        Parameters
+        ----------
+        edge : tuple of node
+            Given edge. An edge, here, is a tuple of two nodes.
+
+        Returns
+        -------
+        bool
+            True if `edge` exists, False otherwise.
+
+        """
         u, v = edge
         return (u, v) in self.edge_properties and (v, u) in self.edge_properties
 
     def edge_weight(self, edge):
+        """Returns weight of given edge.
+
+        Parameters
+        ----------
+        edge : tuple of node
+            Given edge.
+
+        Returns
+        -------
+        float
+            Edge weight.
+
+        """
         return self.get_edge_properties(edge).setdefault(self.WEIGHT_ATTRIBUTE_NAME, self.DEFAULT_WEIGHT)
 
     def neighbors(self, node):
+        """Returns all nodes that are directly accessible from given node.
+
+        Parameters
+        ----------
+        node : float or str
+            Given node identifier.
+
+        Returns
+        -------
+        list of node
+            Nodes directly accessible from given `node`.
+
+        """
         return self.node_neighbors[node]
 
    def has_node(self, node):
+        """Returns whether the requested node exists.
+
+        Parameters
+        ----------
+        node : float or str
+            Given node identifier.
+
+        Returns
+        -------
+        bool
+            True if `node` exists, False otherwise.
+
+        """
         return node in self.node_neighbors
 
     def add_edge(self, edge, wt=1, label='', attrs=None):
+        """Adds an edge to the graph connecting two nodes. An edge, here,
+        is a tuple of two nodes.
+
+        Parameters
+        ----------
+        edge : tuple of node
+            Given edge.
+        wt : float
+            Weight of new edge.
+        label : str
+            Edge label.
+        attrs : list
+            Node attributes specified as (attribute, value).
+
+        Raises
+        ------
+        ValueError
+            If `edge` already exists in graph.
+
+        """
         if attrs is None:
             attrs = []
         u, v = edge
@@ -187,6 +325,27 @@ def add_edge(self, edge, wt=1, label='', attrs=None):
             raise ValueError("Edge (%s, %s) already in graph" % (u, v))
 
     def add_node(self, node, attrs=None):
+        """Adds given node to the graph.
+
+        Note
+        ----
+        While nodes can be of any type, it's strongly recommended
+        to use only numbers and single-line strings as node identifiers if you
+        intend to use write().
+
+        Parameters
+        ----------
+        node : float or str
+            Given node.
+        attrs : list
+            Node attributes specified as (attribute, value).
+
+        Raises
+        ------
+        ValueError
+            If `node` already exists in graph.
+
+        """
         if attrs is None:
             attrs = []
         if node not in self.node_neighbors:
@@ -197,43 +356,137 @@ def add_node(self, node, attrs=None):
 
     def nodes(self):
+        """Returns all nodes of graph.
+
+        Returns
+        -------
+        list of node
+            Nodes of graph.
+
+        """
         return list(self.node_neighbors.keys())
 
     def edges(self):
+        """Returns all edges of graph.
+
+        Returns
+        -------
+        list of edges (tuple of node)
+            Edges of graph.
+
+        """
         return [a for a in self.edge_properties.keys()]
 
     def del_node(self, node):
+        """Removes given node and its edges from graph.
+
+        Parameters
+        ----------
+        node : float or str
+            Given node.
+
+        """
         for each in list(self.neighbors(node)):
             if each != node:
                 self.del_edge((each, node))
         del self.node_neighbors[node]
         del self.node_attr[node]
 
-    # Helper methods
     def get_edge_properties(self, edge):
+        """Returns properties of given edge. If the edge doesn't exist,
+        an empty dictionary is returned.
+
+        Parameters
+        ----------
+        edge : tuple of node
+            Given edge.
+
+        Returns
+        -------
+        dict
+            Properties of given edge.
+
+        """
         return self.edge_properties.setdefault(edge, {})
 
     def add_edge_attributes(self, edge, attrs):
+        """Adds attributes `attrs` to given edge. Order of nodes in edge doesn't
+        matter.
+ + Parameters + ---------- + edge : tuple of node + Given edge. + attrs : list + Provided attributes to add. + + """ for attr in attrs: self.add_edge_attribute(edge, attr) def add_edge_attribute(self, edge, attr): + """Adds attribute `attr`to given edge. Order of nodes in edge doesn't + matter. + + Parameters + ---------- + edge : tuple of node + Given edge. + + attr : object + Provided attribute to add. + + """ self.edge_attr[edge] = self.edge_attributes(edge) + [attr] if edge[0] != edge[1]: self.edge_attr[(edge[1], edge[0])] = self.edge_attributes((edge[1], edge[0])) + [attr] def edge_attributes(self, edge): + """Returns attributes of given edge. In case of non existing edge + returns empty list. + + Parameters + ---------- + edge : tuple of node + Given edge. + + Returns + ------- + list + Attributes of given edge. + + """ try: return self.edge_attr[edge] except KeyError: return [] def set_edge_properties(self, edge, **properties): + """Adds `properties` to given edge. Order of nodes in edge doesn't + matter. + + Parameters + ---------- + edge : tuple of node + Given edge. + + properties : dictionary + Properties to add. + + """ self.edge_properties.setdefault(edge, {}).update(properties) if edge[0] != edge[1]: self.edge_properties.setdefault((edge[1], edge[0]), {}).update(properties) def del_edge(self, edge): + """Removes given edges from graph. + + Parameters + ---------- + edge : tuple of node + Given edge. + + """ u, v = edge self.node_neighbors[u].remove(v) self.del_edge_labeling((u, v)) @@ -242,6 +495,14 @@ def del_edge(self, edge): self.del_edge_labeling((v, u)) def del_edge_labeling(self, edge): + """Removes attributes and properties if given edge. + + Parameters + ---------- + edge : tuple of node + Given edge. + + """ keys = [edge, edge[::-1]] for key in keys: From d5247c19ccd2808628a3ba66ee9d0f5b34a7efcb Mon Sep 17 00:00:00 2001 From: Yuri Isakov Date: Fri, 17 Nov 2017 17:11:17 +0300 Subject: [PATCH 05/27] keywords draft --- gensim/summarization/commons.py | 4 +-- gensim/summarization/keywords.py | 44 ++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/gensim/summarization/commons.py b/gensim/summarization/commons.py index 4e93461018..7fff08c770 100644 --- a/gensim/summarization/commons.py +++ b/gensim/summarization/commons.py @@ -3,14 +3,14 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""This module provides functions of creatinf graph from sequence of values and +"""This module provides functions of creating graph from sequence of values and removing of unreachable nodes. Examples -------- -Create simple graph and add edges. Let's kake a look at nodes. +Create simple graph and add edges. Let's take a look at nodes. >>> gg = build_graph(['Felidae', 'Lion', 'Tiger', 'Wolf']) >>> gg.add_edge(("Felidae", "Lion")) diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index 1630c9389d..758783fd33 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -199,6 +199,36 @@ def _format_results(_keywords, combined_keywords, split, scores): def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), lemmatize=False, deacc=True): + """. + + Parameters + ---------- + text : str + Sequence of values. + ratio : float + If no "words" option is selected, the number of sentences is + reduced by the provided ratio, else, the ratio is ignored. + words : list + . + split : bool + . + scores : bool + . 
+    pos_filter : tuple
+        Part of speech filters.
+    lemmatize : bool
+        Lemmatize words, optional.
+    deacc : bool
+        Remove accentuation, optional.
+
+    Returns
+    -------
+    Graph
+        Created graph.
+
+    """
+
+
     # Gets a dict of word -> lemma
     text = to_unicode(text)
     tokens = _clean_text_by_word(text, deacc=deacc)
@@ -233,6 +263,20 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=
 
 
 def get_graph(text):
+    """Creates and returns graph with given text. Cleans, tokenizes text
+    before creating a graph.
+
+    Parameters
+    ----------
+    text : str
+        Sequence of values.
+
+    Returns
+    -------
+    Graph
+        Created graph.
+
+    """
     tokens = _clean_text_by_word(text)
     split_text = list(_tokenize_by_word(text))
 
From 3031cd067bb6f6c0ebd50ec56be03b1a0f2f19b5 Mon Sep 17 00:00:00 2001
From: Yuri Isakov
Date: Mon, 20 Nov 2017 19:39:50 +0300
Subject: [PATCH 06/27] keywords draft updated

---
 gensim/summarization/keywords.py | 205 +++++++++++++++++++++++++++++--
 1 file changed, 197 insertions(+), 8 deletions(-)

diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py
index 758783fd33..abc1feb24b 100644
--- a/gensim/summarization/keywords.py
+++ b/gensim/summarization/keywords.py
@@ -27,10 +27,39 @@
 
 
 def _get_pos_filters():
+    """Returns default including and excluding filters as frozen sets.
+
+    Returns
+    -------
+    tuple of frozenset
+        Including and excluding filters.
+
+    """
     return frozenset(INCLUDING_FILTER), frozenset(EXCLUDING_FILTER)
 
 
 def _get_words_for_graph(tokens, pos_filter=None):
+    """Filters given dictionary of tokens using provided part of speech filters
+    and returns appropriate list of words.
+
+    Parameters
+    ----------
+    tokens : dict
+        Given tokens.
+    pos_filter : tuple
+        Part of speech filters, optional.
+
+    Returns
+    -------
+    list
+        Filtered words.
+
+    Raises
+    ------
+    ValueError
+        If both include and exclude filters are non-empty at the same time.
+
+    """
     if pos_filter is None:
         include_filters, exclude_filters = _get_pos_filters()
     else:
@@ -49,10 +78,38 @@ def _get_words_for_graph(tokens, pos_filter=None):
 
 
 def _get_first_window(split_text):
+    """Returns first `WINDOW_SIZE` tokens from given split text.
+
+    Parameters
+    ----------
+    split_text : list
+        Given split text.
+
+    Returns
+    -------
+    list
+        First `WINDOW_SIZE` tokens.
+
+    """
     return split_text[:WINDOW_SIZE]
 
 
 def _set_graph_edge(graph, tokens, word_a, word_b):
+    """Sets an edge between nodes word_a and word_b if they exist in `tokens`
+    and `graph`. Works inplace.
+
+    Parameters
+    ----------
+    graph : Graph
+        Given graph.
+    tokens : dict
+        Given tokens.
+    word_a : str
+        First word.
+    word_b : str
+        Second word.
+
+    """
     if word_a in tokens and word_b in tokens:
         lemma_a = tokens[word_a].token
         lemma_b = tokens[word_b].token
@@ -63,12 +120,38 @@ def _set_graph_edge(graph, tokens, word_a, word_b):
 
 
 def _process_first_window(graph, tokens, split_text):
+    """Sets edges between nodes taken from first `WINDOW_SIZE` words
+    of `split_text` if they exist in `tokens` and `graph`. Works inplace.
+
+    Parameters
+    ----------
+    graph : Graph
+        Given graph.
+    tokens : dict
+        Given tokens.
+    split_text : list of str
+        Splitted text.
+
+    """
     first_window = _get_first_window(split_text)
     for word_a, word_b in _combinations(first_window, 2):
         _set_graph_edge(graph, tokens, word_a, word_b)
 
 
 def _init_queue(split_text):
+    """Initializes queue with first words from `split_text`.
+
+    Parameters
+    ----------
+    split_text : list of str
+        Splitted text.
+
+    Returns
+    -------
+    Queue
+        Initialized queue.
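+
+    Example
+    -------
+    A minimal sketch (the words below are arbitrary):
+
+    >>> queue = _init_queue(["cat", "sat", "on", "the", "mat"])
+    >>> queue.qsize() == WINDOW_SIZE - 1
+    True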
+ + """ queue = _Queue() first_window = _get_first_window(split_text) for word in first_window[1:]: @@ -77,17 +160,54 @@ def _init_queue(split_text): def _process_word(graph, tokens, queue, word): + """Sets edge between `word` and each element in queue in `graph` if such nodes + exist in `tokens` and `graph`. + + Parameters + ---------- + graph : Graph + Given graph. + tokens : Graph + Given tokens. + queue : Queue + Given queue. + word : str + Word, possible `node` in graph and item in `tokens`. + + """ for word_to_compare in _queue_iterator(queue): _set_graph_edge(graph, tokens, word, word_to_compare) def _update_queue(queue, word): + """Updates given `queue` (removes last item and puts `word`). + + Parameters + ---------- + queue : Queue + Given queue. + word : str + Word to be added to queue. + """ queue.get() queue.put(word) assert queue.qsize() == (WINDOW_SIZE - 1) def _process_text(graph, tokens, split_text): + """Processes `split_text` by updating given `graph` with new eges between + nodes if they exists in `tokens` and `graph`. Words are taken from + `split_text` with window size `WINDOW_SIZE`. + + Parameters + ---------- + graph : Graph + Given graph. + tokens : Graph + Given tokens. + split_text : list of str + Splitted text. + """ queue = _init_queue(split_text) for i in xrange(WINDOW_SIZE, len(split_text)): word = split_text[i] @@ -96,6 +216,19 @@ def _process_text(graph, tokens, split_text): def _queue_iterator(queue): + """Represents iterator of the given queue. + + Parameters + ---------- + queue : Queue + Given queue. + + Yields + ------ + str + Current item of queue. + + """ iterations = queue.qsize() for _ in xrange(iterations): var = queue.get() @@ -104,20 +237,64 @@ def _queue_iterator(queue): def _set_graph_edges(graph, tokens, split_text): + """Updates given `graph` by setting eges between nodes if they exists in + `tokens` and `graph`. Words are taken from `split_text` with window size + `WINDOW_SIZE`. + + Parameters + ---------- + graph : Graph + Given graph. + tokens : dict + Given tokens. + split_text : list of str + Splitted text. + """ _process_first_window(graph, tokens, split_text) _process_text(graph, tokens, split_text) def _extract_tokens(lemmas, scores, ratio, words): - lemmas.sort(key=lambda s: scores[s], reverse=True) + """Extracts tokens from provided lemmas. Most scored lemmas are used if + `words` not provided. + + Parameters + ---------- + lemmas : list + Given lemmas. + scores : dict + Dictionary with lemmas and its scores. + ratio : float + Proportion of `lemmas` used for final result. + words : int + Number of used words. If no "words" option is selected, the number of + sentences is reduced by the provided ratio, else, the ratio is ignored. + + Returns + ------- + list of (tuple of float and str) + Scores and corresponded lemmas. - # If no "words" option is selected, the number of sentences is - # reduced by the provided ratio, else, the ratio is ignored. + """ + lemmas.sort(key=lambda s: scores[s], reverse=True) length = len(lemmas) * ratio if words is None else words return [(scores[lemmas[i]], lemmas[i],) for i in range(int(length))] def _lemmas_to_words(tokens): + """Extracts words and lemmas from given tokens. + + Parameters + ---------- + tokens : dict + Given tokens. + + Returns + ------- + dict + Keys are lemmas and values are corresponding words. 
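+
+    Example
+    -------
+    An illustrative sketch with a single hand-built token:
+
+    >>> from gensim.summarization.syntactic_unit import SyntacticUnit
+    >>> _lemmas_to_words({"cats": SyntacticUnit("cats", token="cat")})
+    {'cat': ['cats']}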
+ + """ lemma_to_word = {} for word, unit in iteritems(tokens): lemma = unit.token @@ -129,11 +306,23 @@ def _lemmas_to_words(tokens): def _get_keywords_with_score(extracted_lemmas, lemma_to_word): + """Returns lemmas and its scores from `extracted_lemmas` contained in + `lemma_to_word`. + + Parameters + ---------- + extracted_lemmas : list of tuples + Given lemmas. + lemma_to_word : dict of {lemma:list of words} + . + + Returns + ------- + dict + Keywords as keys and scores as values. + """ - :param extracted_lemmas:list of tuples - :param lemma_to_word: dict of {lemma:list of words} - :return: dict of {keyword:score} - """ + keywords = {} for score, lemma in extracted_lemmas: keyword_list = lemma_to_word[lemma] @@ -208,7 +397,7 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= ratio : float If no "words" option is selected, the number of sentences is reduced by the provided ratio, else, the ratio is ignored. - words : list + words : int . split : bool . From 4d7b0a9ff8ccc45de27d7734bea1cd94555ecf91 Mon Sep 17 00:00:00 2001 From: Yuri Isakov Date: Tue, 21 Nov 2017 19:49:14 +0300 Subject: [PATCH 07/27] keywords draft updated again --- gensim/summarization/keywords.py | 68 ++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 8 deletions(-) diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index abc1feb24b..c5761d5a4b 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -306,7 +306,7 @@ def _lemmas_to_words(tokens): def _get_keywords_with_score(extracted_lemmas, lemma_to_word): - """Returns lemmas and its scores from `extracted_lemmas` contained in + """Returns words of `extracted_lemmas` and its scores. Words contains in `lemma_to_word`. Parameters @@ -314,7 +314,7 @@ def _get_keywords_with_score(extracted_lemmas, lemma_to_word): extracted_lemmas : list of tuples Given lemmas. lemma_to_word : dict of {lemma:list of words} - . + Lemmas and corresponding words. Returns ------- @@ -332,15 +332,37 @@ def _get_keywords_with_score(extracted_lemmas, lemma_to_word): def _strip_word(word): + """Return cleaned `word`. + + Parameters + ---------- + word : str + Given word. + + Returns + ------- + str + Cleaned word. + """ stripped_word_list = list(_tokenize_by_word(word)) return stripped_word_list[0] if stripped_word_list else "" def _get_combined_keywords(_keywords, split_text): """ - :param keywords:dict of keywords:scores - :param split_text: list of strings - :return: combined_keywords:list + + Parameters + ---------- + _keywords : dict {keywords:scores} + Keywords and its scores. + split_text : list of str + Splitted text. + + Returns + ------- + list + . + """ result = [] _keywords = _keywords.copy() @@ -364,6 +386,20 @@ def _get_combined_keywords(_keywords, split_text): def _get_average_score(concept, _keywords): + """Returns average score of words in `concept`. + + Parameters + ---------- + text : str + Input text. + _keywords : dict {keywords:scores} + Keywords and its scores. + + Returns + ------- + float + Average score. + """ word_list = concept.split() word_counter = 0 total = 0 @@ -374,9 +410,25 @@ def _get_average_score(concept, _keywords): def _format_results(_keywords, combined_keywords, split, scores): - """ - :param keywords:dict of keywords:scores - :param combined_keywords:list of word/s + """Formats, sorts and returns combined_keywords in desired format. + + Parameters + ---------- + _keywords : dict {keywords:scores} + Keywords and its scores. 
+ combined_keywords : list of str + ?. + split : bool + Whether split result or return string, optional. + scores : bool + Whether return `combined_keywords` with scores, optional. If True + `split` is ignored. + + Returns + ------- + str or list of str or list of (tuple of str) + Formated `combined_keywords`. + """ combined_keywords.sort(key=lambda w: _get_average_score(w, _keywords), reverse=True) if scores: From 2c8ef28b185107fb22d2c0c5a6d6bd608ed22025 Mon Sep 17 00:00:00 2001 From: Yuri Isakov Date: Wed, 22 Nov 2017 23:16:25 +0300 Subject: [PATCH 08/27] keywords edited --- gensim/summarization/keywords.py | 53 ++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index c5761d5a4b..867311f53f 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -349,7 +349,8 @@ def _strip_word(word): def _get_combined_keywords(_keywords, split_text): - """ + """Returns most scored words (`_keywords`) contained in `split_text` and its + combinations. Parameters ---------- @@ -360,8 +361,8 @@ def _get_combined_keywords(_keywords, split_text): Returns ------- - list - . + list of str + Keywords and/or its combinations. """ result = [] @@ -390,7 +391,7 @@ def _get_average_score(concept, _keywords): Parameters ---------- - text : str + concept : str Input text. _keywords : dict {keywords:scores} Keywords and its scores. @@ -410,14 +411,14 @@ def _get_average_score(concept, _keywords): def _format_results(_keywords, combined_keywords, split, scores): - """Formats, sorts and returns combined_keywords in desired format. + """Formats, sorts and returns `combined_keywords` in desired format. Parameters ---------- _keywords : dict {keywords:scores} Keywords and its scores. combined_keywords : list of str - ?. + Most ranked words and/or its combinations. split : bool Whether split result or return string, optional. scores : bool @@ -440,7 +441,7 @@ def _format_results(_keywords, combined_keywords, split, scores): def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), lemmatize=False, deacc=True): - """. + """Returns most ranked word of provided text and/or its combinations . Parameters ---------- @@ -450,11 +451,11 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= If no "words" option is selected, the number of sentences is reduced by the provided ratio, else, the ratio is ignored. words : int - . + Number of returned words. split : bool - . + Whether split keywords, optional. scores : bool - . + Whether score of keyword, optional. pos_filter : tuple Part of speech filters. lemmatize : bool @@ -464,8 +465,20 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= Returns ------- - Graph - Created graph. + str or list of str or list of (tuple of str) + + Example + ------- + >>> from gensim.summarization import keywords + >>> text="Challenges in natural language processing frequently involve \ + >>> speech recognition, natural language understanding, natural language \ + >>> generation (frequently from formal, machine-readable logical forms), \ + >>> connecting language and machine perception, dialog systems, or some \ + >>> combination thereof." 
+ >>> print(gensim.summarization.keywords(text)) + natural language + machine + frequently """ @@ -504,7 +517,7 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= def get_graph(text): - """Creates and returns graph with given text. Cleans, tokenizes text + """Creates and returns graph from given text. Cleans, tokenizes text before creating a graph. Parameters @@ -517,6 +530,20 @@ def get_graph(text): Graph Created graph. + + Example + ------- + >>> from gensim.summarization.keywords import get_graph + >>> text = "Fly me to the moon \ + >>> Let me play among the stars \ + >>> Let me see what spring is like \ + >>> On a, Jupiter and Mars" + >>> g = get_graph(text) + >>> print(g.nodes()) + ['fly', 'moon', 'let', 'plai', 'star', 'spring', 'like', 'jupit', 'mar'] + >>> print(g.neighbors("let")) + ['moon', 'star'] + """ tokens = _clean_text_by_word(text) split_text = list(_tokenize_by_word(text)) From 254dce7e0d06a021e2641554be443fdee61bac9a Mon Sep 17 00:00:00 2001 From: Yuri Isakov Date: Thu, 23 Nov 2017 19:16:36 +0300 Subject: [PATCH 09/27] pagerank started --- gensim/summarization/bm25.py | 4 +- gensim/summarization/keywords.py | 66 +++++++++++++---------- gensim/summarization/pagerank_weighted.py | 29 ++++++++++ 3 files changed, 70 insertions(+), 29 deletions(-) diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index 608acf7a6a..f1e6ae60df 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -7,8 +7,8 @@ corpus and helper class `BM25` used in calculations. -Example: --------- +Example +------- >>> import numpy as np >>> from gensim.summarization.bm25 import get_bm25_weights >>> corpus = [ diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index 867311f53f..bfd5121df8 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -3,6 +3,45 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""This module contains functions to find keywords of the text and building +graph on tokens from text. + + +Examples +-------- +>>> from gensim.summarization import keywords +>>> text="Challenges in natural language processing frequently involve \ +>>> speech recognition, natural language understanding, natural language \ +>>> generation (frequently from formal, machine-readable logical forms), \ +>>> connecting language and machine perception, dialog systems, or some \ +>>> combination thereof." +>>> print(gensim.summarization.keywords(text)) +natural language +machine +frequently + + +>>> from gensim.summarization.keywords import get_graph +>>> text = "Fly me to the moon \ +>>> Let me play among the stars \ +>>> Let me see what spring is like \ +>>> On a, Jupiter and Mars" +>>> g = get_graph(text) +>>> print(g.nodes()) +['fly', 'moon', 'let', 'plai', 'star', 'spring', 'like', 'jupit', 'mar'] +>>> print(g.neighbors("let")) +['moon', 'star'] + + + +Data: +----- +.. data:: WINDOW_SIZE - Size of window, number of consequtive tokens in processing. +.. data:: INCLUDING_FILTER - including part of speech filters. +.. data:: EXCLUDING_FILTER - excluding part of speech filters. 
+ +""" + from gensim.summarization.pagerank_weighted import pagerank_weighted as _pagerank from gensim.summarization.textcleaner import clean_text_by_word as _clean_text_by_word from gensim.summarization.textcleaner import tokenize_by_word as _tokenize_by_word @@ -466,19 +505,6 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= Returns ------- str or list of str or list of (tuple of str) - - Example - ------- - >>> from gensim.summarization import keywords - >>> text="Challenges in natural language processing frequently involve \ - >>> speech recognition, natural language understanding, natural language \ - >>> generation (frequently from formal, machine-readable logical forms), \ - >>> connecting language and machine perception, dialog systems, or some \ - >>> combination thereof." - >>> print(gensim.summarization.keywords(text)) - natural language - machine - frequently """ @@ -530,20 +556,6 @@ def get_graph(text): Graph Created graph. - - Example - ------- - >>> from gensim.summarization.keywords import get_graph - >>> text = "Fly me to the moon \ - >>> Let me play among the stars \ - >>> Let me see what spring is like \ - >>> On a, Jupiter and Mars" - >>> g = get_graph(text) - >>> print(g.nodes()) - ['fly', 'moon', 'let', 'plai', 'star', 'spring', 'like', 'jupit', 'mar'] - >>> print(g.neighbors("let")) - ['moon', 'star'] - """ tokens = _clean_text_by_word(text) split_text = list(_tokenize_by_word(text)) diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py index f5a24635a1..2e8e9c2e5d 100644 --- a/gensim/summarization/pagerank_weighted.py +++ b/gensim/summarization/pagerank_weighted.py @@ -2,6 +2,18 @@ # -*- coding: utf-8 -*- # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" + + +Examples +-------- + + + +""" + + import numpy from numpy import empty as empty_matrix from scipy.linalg import eig @@ -60,6 +72,8 @@ def build_probability_matrix(graph): def principal_eigenvector(a): + + # Note that we prefer to use `eigs` even for dense matrix # because we need only one eigenvector. See #441, #438 for discussion. @@ -74,6 +88,21 @@ def principal_eigenvector(a): def process_results(graph, vec): + """Returns `graph` nodes and corresponding modules of provided eigenvectors. + + Parameters + ---------- + graph : Graph + . + vec : array?? + . + + Returns + ------- + dict + Nodes of graph and adfs + + """ scores = {} for i, node in enumerate(graph.nodes()): scores[node] = abs(vec[i]) From a2c21021b09d33d99befbfe92bc1218a628c2475 Mon Sep 17 00:00:00 2001 From: Yuri Isakov Date: Sat, 25 Nov 2017 20:33:59 +0300 Subject: [PATCH 10/27] pagerank summarizer docstring added --- gensim/summarization/pagerank_weighted.py | 86 +++++++- gensim/summarization/summarizer.py | 257 ++++++++++++++++++++-- 2 files changed, 310 insertions(+), 33 deletions(-) diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py index 2e8e9c2e5d..226a8927ba 100644 --- a/gensim/summarization/pagerank_weighted.py +++ b/gensim/summarization/pagerank_weighted.py @@ -6,10 +6,22 @@ """ -Examples --------- - - +Example +------- +>>> from gensim.summarization.keywords import get_graph +>>> from gensim.summarization.pagerank_weighted import pagerank_weighted +>>> text = "In graph theory and computer science, an adjacency matrix \ +>>> is a square matrix used to represent a finite graph." 
+
+>>> graph = get_graph(text)
+>>> pagerank_weighted(graph)
+{'adjac': array([ 0.29628575]),
+ 'finit': array([ 0.29628575]),
+ 'graph': array([ 0.56766066]),
+ 'matrix': array([ 0.56766066]),
+ 'repres': array([ 0.04680678]),
+ 'scienc': array([ 0.04680678]),
+ 'squar': array([ 0.29628575]),
+ 'theori': array([ 0.29628575])}
+
+"""
 
 
 import numpy
 from numpy import empty as empty_matrix
 from scipy.linalg import eig
@@ -30,6 +42,21 @@
 
 
 def pagerank_weighted(graph, damping=0.85):
+    """Returns dictionary of `graph`'s nodes and its ranks.
+
+    Parameters
+    ----------
+    graph : Graph
+        Given graph.
+    damping : float
+        Damping parameter, optional.
+
+    Returns
+    -------
+    dict
+        Keys are `graph` nodes, values are its ranks.
+
+    """
     adjacency_matrix = build_adjacency_matrix(graph)
     probability_matrix = build_probability_matrix(graph)
 
@@ -42,6 +69,19 @@
 
 
 def build_adjacency_matrix(graph):
+    """Returns matrix representation of given `graph`.
+
+    Parameters
+    ----------
+    graph : Graph
+        Given graph.
+
+    Returns
+    -------
+    csr_matrix (n, n)
+        Adjacency matrix of given `graph`.
+
+    """
     row = []
     col = []
     data = []
@@ -62,6 +102,20 @@
 
 
 def build_probability_matrix(graph):
+    """Returns square probability matrix of shape (n, n), where n is number
+    of nodes of the given `graph`.
+
+    Parameters
+    ----------
+    graph : Graph
+        Given graph.
+
+    Returns
+    -------
+    matrix (n, n)
+        Matrix with all elements equal to 1/n.
+
+    """
     dimension = len(graph.nodes())
     matrix = empty_matrix((dimension, dimension))
 
@@ -72,6 +126,19 @@
 
 
 def principal_eigenvector(a):
+    """Returns eigenvector of square matrix `a`.
+
+    Parameters
+    ----------
+    a : array (n, n)
+        Given matrix.
+
+    Returns
+    -------
+    array (n, )
+        Eigenvector of matrix `a`.
+
+    """
 
     # Note that we prefer to use `eigs` even for dense matrix
     # because we need only one eigenvector. See #441, #438 for discussion.
@@ -88,19 +155,20 @@
 
 def process_results(graph, vec):
-    """Returns `graph` nodes and corresponding modules of provided eigenvectors.
+    """Returns `graph` nodes and corresponding absolute values of provided
+    eigenvector.
 
     Parameters
     ----------
     graph : Graph
-        .
+        Given graph.
-    vec : array??
-        .
+    vec : array
+        Given eigenvector.
 
     Returns
     -------
     dict
-        Nodes of graph and adfs
+        Keys are graph nodes, values are elements of eigenvector.
 
     """
     scores = {}
     for i, node in enumerate(graph.nodes()):
         scores[node] = abs(vec[i])
 
diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py
index 2e2d4ed45e..e0d2ae0a82 100644
--- a/gensim/summarization/summarizer.py
+++ b/gensim/summarization/summarizer.py
@@ -3,6 +3,53 @@
 #
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
+"""This module provides functions for summarizing texts. Summarizing is based on
+ranks of text sentences using the BM25 algorithm.
+
+
+Data:
+-----
+.. data:: INPUT_MIN_LENGTH - Minimal number of sentences in text.
+.. data:: WEIGHT_THRESHOLD - Minimal weight of edge between graph nodes. Smaller
+weights set to zero.
+
+Example
+-------
+
+>>> from gensim.summarization.summarizer import summarize
+>>> text = '''
+>>> Rice Pudding - Poem by Alan Alexander Milne
+>>>
+>>> What is the matter with Mary Jane?
+>>> She's crying with all her might and main,
+>>> And she won't eat her dinner - rice pudding again -
+>>> What is the matter with Mary Jane?
+>>> What is the matter with Mary Jane?
+>>> I've promised her dolls and a daisy-chain,
+>>> And a book about animals - all in vain -
+>>> What is the matter with Mary Jane?
+>>> What is the matter with Mary Jane?
+>>> She's perfectly well, and she hasn't a pain;
+>>> But, look at her, now she's beginning again! -
+>>> What is the matter with Mary Jane?
+>>> What is the matter with Mary Jane?
+>>> I've promised her sweets and a ride in the train,
+>>> And I've begged her to stop for a bit and explain -
+>>> What is the matter with Mary Jane?
+>>> What is the matter with Mary Jane?
+>>> She's perfectly well and she hasn't a pain,
+>>> And it's lovely rice pudding for dinner again!
+>>> What is the matter with Mary Jane?
+>>> '''
+>>> print(summarize(text))
+And she won't eat her dinner - rice pudding again -
+I've promised her dolls and a daisy-chain,
+I've promised her sweets and a ride in the train,
+And it's lovely rice pudding for dinner again!
+
+"""
+
 import logging
 from gensim.summarization.pagerank_weighted import pagerank_weighted as _pagerank
 from gensim.summarization.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
@@ -22,6 +69,15 @@
 
 
 def _set_graph_edge_weights(graph):
+    """Sets weights using BM25 algorithm. Leaves small weights as zeroes. If all
+    weights are fairly small, forces all weights to 1. Works inplace.
+
+    Parameters
+    ----------
+    graph : Graph
+        Given graph.
+
+    """
     documents = graph.nodes()
     weights = _bm25_weights(documents)
 
@@ -48,6 +104,14 @@
 
 
 def _create_valid_graph(graph):
+    """Sets weights of all edges between distinct nodes to 1. Works inplace.
+
+    Parameters
+    ----------
+    graph : Graph
+        Given graph.
+
+    """
     nodes = graph.nodes()
 
     for i in xrange(len(nodes)):
@@ -64,10 +128,42 @@
 
 
 def _get_doc_length(doc):
+    """Returns length of (tokenized) document.
+
+    Parameters
+    ----------
+    doc : list of (list of (tuple of int))
+        Given document.
+
+    Returns
+    -------
+    int
+        Length of document.
+
+    """
     return sum([item[1] for item in doc])
 
 
 def _get_similarity(doc1, doc2, vec1, vec2):
+    """Returns similarity of two documents.
+
+    Parameters
+    ----------
+    doc1 : list of (list of (tuple of int))
+        First document.
+    doc2 : list of (list of (tuple of int))
+        Second document.
+    vec1 : array
+        Vector for first document.
+    vec2 : array
+        Vector for second document.
+
+    Returns
+    -------
+    float
+        Similarity of two documents.
+
+    """
     numerator = vec1.dot(vec2.transpose()).toarray()[0][0]
     length_1 = _get_doc_length(doc1)
     length_2 = _get_doc_length(doc2)
@@ -78,20 +174,65 @@
 
 
 def _build_corpus(sentences):
+    """Returns built corpus from provided sentences.
+
+    Parameters
+    ----------
+    sentences : list of SyntacticUnit
+        Given sentences.
+
+    Returns
+    -------
+    list of (list of (tuple of int))
+        Corpus built from sentences.
+
+    """
     split_tokens = [sentence.token.split() for sentence in sentences]
     dictionary = Dictionary(split_tokens)
     return [dictionary.doc2bow(token) for token in split_tokens]
 
 
 def _get_important_sentences(sentences, corpus, important_docs):
+    """Returns most important sentences.
+
+    Parameters
+    ----------
+    sentences : list of SyntacticUnit
+        Given sentences.
+    corpus : list of (list of (tuple of int))
+        Provided corpus.
+    important_docs : list of (list of (tuple of int))
+        Most important docs of the corpus.
+
+    Returns
+    -------
+    list of SyntacticUnit
+        Most important sentences.
+
+    """
     hashable_corpus = _build_hasheable_corpus(corpus)
     sentences_by_corpus = dict(zip(hashable_corpus, sentences))
     return [sentences_by_corpus[tuple(important_doc)] for important_doc in important_docs]
 
 
 def _get_sentences_with_word_count(sentences, word_count):
-    """ Given a list of sentences, returns a list of sentences with a
-    total word count similar to the word count provided."""
+    """Returns list of sentences. Total number of returned words is close to
+    specified `word_count`.
+
+    Parameters
+    ----------
+    sentences : list of SyntacticUnit
+        Given sentences.
+    word_count : int or None
+        Number of returned words. If None, all of the most important sentences
+        will be returned.
+
+    Returns
+    -------
+    list of SyntacticUnit
+        Most important sentences.
+
+    """
     length = 0
     selected_sentences = []
 
@@ -111,6 +252,26 @@
 
 
 def _extract_important_sentences(sentences, corpus, important_docs, word_count):
+    """Returns most important sentences of the `corpus`.
+
+    Parameters
+    ----------
+    sentences : list of SyntacticUnit
+        Given sentences.
+    corpus : list of (list of (tuple of int))
+        Provided corpus.
+    important_docs : list of (list of (tuple of int))
+        Most important docs of the corpus.
+    word_count : int or None
+        Number of returned words. If None, all of the most important sentences
+        will be returned.
+
+    Returns
+    -------
+    list of SyntacticUnit
+        Most important sentences.
+
+    """
     important_sentences = _get_important_sentences(sentences, corpus, important_docs)
 
     # If no "word_count" option is provided, the number of sentences is
 
 
 def _format_results(extracted_sentences, split):
+    """Returns `extracted_sentences` in desired format.
+
+    Parameters
+    ----------
+    extracted_sentences : list of SyntacticUnit
+        Given sentences.
+    split : bool
+        If True, sentences will be returned as list. Otherwise sentences will be
+        merged and returned as string.
+
+    Returns
+    -------
+    str or list of str
+        Formatted result.
+
+    """
     if split:
         return [sentence.text for sentence in extracted_sentences]
     return "\n".join([sentence.text for sentence in extracted_sentences])
 
 
 def _build_hasheable_corpus(corpus):
+    """Returns hashable version of `corpus`.
+
+    Parameters
+    ----------
+    corpus : list of (list of (tuple of int))
+        Given corpus.
+
+    Returns
+    -------
+    list of (tuple of (tuple of int))
+        Hashable corpus.
+
+    """
     return [tuple(doc) for doc in corpus]
 
 
 def summarize_corpus(corpus, ratio=0.2):
-    """
-    Returns a list of the most important documents of a corpus using a
-    variation of the TextRank algorithm.
-    The input must have at least INPUT_MIN_LENGTH (%d) documents for the
-    summary to make sense.
+    """Returns a list of the most important documents of a corpus using a
+    variation of the TextRank algorithm. The input must have at least
+    `INPUT_MIN_LENGTH` documents for the summary to make sense.
 
     The length of the output can be specified using the ratio parameter,
     which determines how many documents will be chosen for the summary
-    (defaults at 20%% of the number of documents of the corpus).
+    (defaults at 20% of the number of documents of the corpus).
+
+    Parameters
+    ----------
+    corpus : list of (list of (tuple of int))
+        Given corpus.
+    ratio : float
+        Number between 0 and 1 that determines the proportion of the number of
+        sentences of the original text to be chosen for the summary. Optional.
+
+    Returns
+    -------
+    str or list of str
+        Most important documents of given `corpus` sorted by the document score,
+        highest first.

-    The most important documents are returned as a list sorted by the
-    document score, highest first.
-    """ % INPUT_MIN_LENGTH
+    """
     hashable_corpus = _build_hasheable_corpus(corpus)

     # If the corpus is empty, the function ends.
@@ -171,29 +371,38 @@ def summarize_corpus(corpus, ratio=0.2):


 def summarize(text, ratio=0.2, word_count=None, split=False):
-    """
-    Returns a summarized version of the given text using a variation of
+    """Returns a summarized version of the given text using a variation of
     the TextRank algorithm (see https://arxiv.org/abs/1602.03606).

     The output summary will consist of the most representative sentences
     and will be returned as a string, divided by newlines.
-    If the split parameter is set to True, a list of sentences will be
-    returned instead.

     The input should be a string, and must be longer than
-    INPUT_MIN_LENGTH sentences for the summary to make sense. The text
+    `INPUT_MIN_LENGTH` sentences for the summary to make sense. The text
     will be split into sentences using the split_sentences method in the
-    summarization.texcleaner module.
-    Note that newlines divide sentences.
+    summarization.textcleaner module. Note that newlines divide sentences.

     The length of the output can be specified using the ratio and
-    word_count parameters:
-        ratio should be a number between 0 and 1 that determines the
-        percentage of the number of sentences of the original text to be
-        chosen for the summary (defaults at 0.2).
-        word_count determines how many words will the output contain.
+    word_count parameters.
+
+    Parameters
+    ----------
+    text : str
+        Given text.
+    ratio : float
+        Number between 0 and 1 that determines the proportion of the number of
+        sentences of the original text to be chosen for the summary. Optional.
+    word_count : int
+        Determines how many words the output will contain.
     If both parameters are provided, the ratio will be ignored.
+    split : bool
+        If True, list of sentences will be returned. Otherwise joined
+        sentences will be returned as a string.
+
+    Returns
+    -------
+    str or list of str
+        Most representative sentences of the given text.
     """
     # Gets a list of processed sentences.

From 1a8793402c67a8fbb3479d7509fa350f3a4b7eba Mon Sep 17 00:00:00 2001
From: Yuri Isakov
Date: Mon, 27 Nov 2017 18:47:25 +0300
Subject: [PATCH 11/27] fixed types in docstrings in commons, bm25, graph and keywords

---
 gensim/summarization/bm25.py        |  25 +++--
 gensim/summarization/commons.py     |   4 +-
 gensim/summarization/graph.py       | 100 ++++++++++----------
 gensim/summarization/keywords.py    | 138 ++++++++++++++--------------
 gensim/summarization/textcleaner.py |  13 ++-
 5 files changed, 151 insertions(+), 129 deletions(-)

diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py
index f1e6ae60df..92be95930e 100644
--- a/gensim/summarization/bm25.py
+++ b/gensim/summarization/bm25.py
@@ -3,8 +3,14 @@
 #
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

-"""This module contains function of computing BM25 scores for documents in
-corpus and helper class `BM25` used in calculations.
+"""This module contains functions to compute rank scores for documents in
+corpus and helper class `BM25` used in calculations. Original algorithm
+described in [1]_; you may also check the Wikipedia page [2]_.
+
+
+.. [1] Robertson, Stephen; Zaragoza, Hugo (2009).
The Probabilistic Relevance Framework: BM25 and Beyond, http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf +.. [2] Okapi BM25 on Wikipedia, https://en.wikipedia.org/wiki/Okapi_BM25 + Example @@ -51,11 +57,14 @@ class BM25(object): corpus : list of (list of str) Corpus of documents. f : list of dict - Terms frequencies for each document in `corpus`. + Dictionary with terms frequencies for each document in `corpus`. Words + used as keys and frequencies as values. df : dict - Terms frequencies for whole `corpus`. + Dictionary with terms frequencies for whole `corpus`. Words used as keys + and frequencies as values. idf : dict - Inverse document frequency. + Dictionary with inversed terms frequencies for whole `corpus`. Words + used as keys and frequencies as values. """ @@ -66,7 +75,7 @@ def __init__(self, corpus): Parameters ---------- corpus : list of (list of str) - Corpus of documents. + Given corups. """ self.corpus_size = len(corpus) @@ -79,8 +88,8 @@ def __init__(self, corpus): def initialize(self): - """Calculates frequncies of terms in documents and in corpus. Also - computes inverse document frequncies. + """Calculates frequencies of terms in documents and in corpus. Also + computes inverse document frequencies. """ for document in self.corpus: diff --git a/gensim/summarization/commons.py b/gensim/summarization/commons.py index 7fff08c770..c7cd104e90 100644 --- a/gensim/summarization/commons.py +++ b/gensim/summarization/commons.py @@ -39,7 +39,7 @@ def build_graph(sequence): Returns ------- - Graph + :class:~gensim.summarization.graph.Graph Created graph. """ @@ -55,7 +55,7 @@ def remove_unreachable_nodes(graph): Parameters ---------- - graph : Graph + graph : :class:~gensim.summarization.graph.Graph Given graph. """ diff --git a/gensim/summarization/graph.py b/gensim/summarization/graph.py index a1b4ea0540..44be0519ef 100644 --- a/gensim/summarization/graph.py +++ b/gensim/summarization/graph.py @@ -19,7 +19,7 @@ class Graph (based on IGraph) which implements undirected graph. >>> g.nodes() ['Felidae', 'Lion', 'Tiger', 'Wolf'] -Add some edges. Let's check neighbours. +Add some edges and check neighbours. >>> g.add_edge(("Felidae", "Lion")) >>> g.add_edge(("Felidae", "Tiger")) @@ -49,7 +49,7 @@ def nodes(self): Returns ------- - list of node + list of hashable Nodes of graph. """ @@ -62,7 +62,7 @@ def edges(self): Returns ------- - list of (tuple of node) + list of (tuple of hashable) Edges of graph. """ @@ -75,12 +75,12 @@ def neighbors(self, node): Parameters ---------- - node : str or float + node : hashable Given node identifier. Returns ------- - list of node + list of hashable Nodes directly accessible from given `node`. """ @@ -92,7 +92,7 @@ def has_node(self, node): Parameters ---------- - node : str or float + node : hashable Given node identifier. Returns @@ -115,9 +115,9 @@ def add_node(self, node, attrs=None): Parameters ---------- - node : float or str + node : hashable Given node - attrs : list + attrs : list, optional Node attributes specified as (attribute, value) """ @@ -130,13 +130,13 @@ def add_edge(self, edge, wt=1, label='', attrs=None): Parameters ---------- - edge : tuple of node + edge : tuple of hashable Given edge. - wt : float + wt : float, optional Weight of new edge. - label : str + label : str, optional Edge label. - attrs : list + attrs : list, optional Node attributes specified as (attribute, value) """ @@ -148,7 +148,7 @@ def has_edge(self, edge): Parameters ---------- - edge : tuple of node + edge : tuple of hashable Given edge. 
An edge, here, is a tuple of two nodes. Returns @@ -165,7 +165,7 @@ def edge_weight(self, edge): Parameters ---------- - edge : tuple of node + edge : tuple of hashable Given edge. Returns @@ -178,12 +178,12 @@ def edge_weight(self, edge): @abstractmethod def del_node(self, node): - """Removes node and its edges from graph. + """Removes node and its edges from the graph. Parameters ---------- - node : float or str - Given node. + node : hashable + Node to delete. """ pass @@ -200,9 +200,9 @@ class Graph(IGraph): DEFAULT_WEIGHT : float Weight set by default. LABEL_ATTRIBUTE_NAME : str - Name of attribute. + Default name of attribute. Not used. DEFAULT_LABEL : str - Label set by default. + Label set by default. Not used. """ @@ -213,6 +213,8 @@ class Graph(IGraph): DEFAULT_LABEL = "" def __init__(self): + """Initializes object.""" + # Metadata about edges # Mapping: Edge -> Dict mapping, lablel-> str, wt->num self.edge_properties = {} @@ -230,8 +232,8 @@ def has_edge(self, edge): Parameters ---------- - edge : tuple of node - Given edge. An edge, here, is a tuple of two nodes. + edge : tuple of hashable, size = 2 + Given edge. Returns ------- @@ -247,7 +249,7 @@ def edge_weight(self, edge): Parameters ---------- - edge : tuple of node + edge : tuple of hashable, size = 2 Given edge. Returns @@ -263,12 +265,12 @@ def neighbors(self, node): Parameters ---------- - node : float or str + node : hashable Given node identifier. Returns ------- - list of node + list of hashable Nodes directly accessible from given `node`. """ @@ -279,8 +281,8 @@ def has_node(self, node): Parameters ---------- - node : float or str - Given node identifier. + node : hashable + Given node. Returns ------- @@ -296,13 +298,13 @@ def add_edge(self, edge, wt=1, label='', attrs=None): Parameters ---------- - edge : tuple of node + edge : tuple of hashable, size = 2 Given edge. - wt : float + wt : float, optional Weight of new edge. - label : str + label : str, optional Edge label. - attrs : list + attrs : list, optional Node attributes specified as (attribute, value). Raises @@ -335,9 +337,9 @@ def add_node(self, node, attrs=None): Parameters ---------- - node : float or str + node : hashable Given node. - attrs : list + attrs : list, optional Node attributes specified as (attribute, value) Raises @@ -355,33 +357,33 @@ def add_node(self, node, attrs=None): raise ValueError("Node %s already in graph" % node) def nodes(self): - return list(self.node_neighbors.keys()) - """Returns all nodes of graph. + """Returns all nodes of the graph. Returns ------- - list of node + list of hashable Nodes of graph. """ + return list(self.node_neighbors.keys()) def edges(self): - """Returns all edges of graph. + """Returns all edges of the graph. Returns ------- - list of edges (tuple of node) + list of (tuple of hashable, size = 2) Edges of graph. """ return [a for a in self.edge_properties.keys()] def del_node(self, node): - """Removes given node and its edges from graph. + """Removes given node and its edges from the graph. Parameters ---------- - node : float or str + node : hashable Given node. """ @@ -397,7 +399,7 @@ def get_edge_properties(self, edge): Parameters ---------- - edge : tuple of node + edge : tuple of hashable, size = 2 Given edge. Returns @@ -414,7 +416,7 @@ def add_edge_attributes(self, edge, attrs): Parameters ---------- - edge : tuple of node + edge : tuple of hashable, size = 2 Given edge. attrs : list Provided attributes to add. 
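A short, hedged illustration of the Graph edge API documented in the hunks
above (it mirrors the module docstring's own example; the attribute behavior
is inferred from this diff, not part of the patch itself):

>>> g = Graph()
>>> g.add_node('a')
>>> g.add_node('b')
>>> g.add_edge(('a', 'b'), wt=2.0)
>>> g.add_edge_attributes(('a', 'b'), [('color', 'red')])
>>> g.edge_attributes(('a', 'b'))
[('color', 'red')]
>>> g.edge_weight(('a', 'b'))
2.0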
@@ -429,7 +431,7 @@ def add_edge_attribute(self, edge, attr): Parameters ---------- - edge : tuple of node + edge : tuple of hashable, size = 2 Given edge. attr : object @@ -447,7 +449,7 @@ def edge_attributes(self, edge): Parameters ---------- - edge : tuple of node + edge : tuple of hashable, size = 2 Given edge. Returns @@ -467,10 +469,10 @@ def set_edge_properties(self, edge, **properties): Parameters ---------- - edge : tuple of node + edge : tuple of hashable, size = 2 Given edge. - properties : dictionary + properties : dict Properties to add. """ @@ -479,11 +481,11 @@ def set_edge_properties(self, edge, **properties): self.edge_properties.setdefault((edge[1], edge[0]), {}).update(properties) def del_edge(self, edge): - """Removes given edges from graph. + """Removes given edges from the graph. Parameters ---------- - edge : tuple of node + edge : tuple of hashable, size = 2 Given edge. """ @@ -495,11 +497,11 @@ def del_edge(self, edge): self.del_edge_labeling((v, u)) def del_edge_labeling(self, edge): - """Removes attributes and properties if given edge. + """Removes attributes and properties of given edge. Parameters ---------- - edge : tuple of node + edge : tuple of hashable, size = 2 Given edge. """ diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index bfd5121df8..559e10b3f0 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -10,11 +10,11 @@ Examples -------- >>> from gensim.summarization import keywords ->>> text="Challenges in natural language processing frequently involve \ ->>> speech recognition, natural language understanding, natural language \ ->>> generation (frequently from formal, machine-readable logical forms), \ ->>> connecting language and machine perception, dialog systems, or some \ ->>> combination thereof." +>>> text='''Challenges in natural language processing frequently involve +>>> speech recognition, natural language understanding, natural language +>>> generation (frequently from formal, machine-readable logical forms), +>>> connecting language and machine perception, dialog systems, or some +>>> combination thereof.''' >>> print(gensim.summarization.keywords(text)) natural language machine @@ -22,10 +22,10 @@ >>> from gensim.summarization.keywords import get_graph ->>> text = "Fly me to the moon \ ->>> Let me play among the stars \ ->>> Let me see what spring is like \ ->>> On a, Jupiter and Mars" +>>> text = '''Fly me to the moon +>>> Let me play among the stars +>>> Let me see what spring is like +>>> On a, Jupiter and Mars''' >>> g = get_graph(text) >>> print(g.nodes()) ['fly', 'moon', 'let', 'plai', 'star', 'spring', 'like', 'jupit', 'mar'] @@ -83,9 +83,9 @@ def _get_words_for_graph(tokens, pos_filter=None): Parameters ---------- - tokens : dictionary - Input text. - pos_filter : tuple + tokens : dict + Original units (words) as keys and processed units (tokens) as values. + pos_filter : tuple of list Part of speech filters, optional. Returns @@ -117,12 +117,13 @@ def _get_words_for_graph(tokens, pos_filter=None): def _get_first_window(split_text): - """Returns first `WINDOW_SIZE` tokens from given splitted text. + """Returns first :const:`~gensim.parsing.keywords.WINDOW_SIZE` tokens from + given Splited text. Parameters ---------- split_text : list - Given splitted text. + Given Splited text. 
Returns ------- @@ -134,19 +135,19 @@ def _get_first_window(split_text): def _set_graph_edge(graph, tokens, word_a, word_b): - """Sets an edge between nodes word_a and word_b if they exists in `tokens` - and `graph`. Works inplace. + """Sets an edge between nodes named word_a and word_b if they exists in + `tokens` and `graph`, inplace. Parameters ---------- - graph : Graph + graph : :class:~gensim.summarization.graph.Graph Given graph. - tokens : Graph - Given tokens. + tokens : dict + Original units (words) as keys and processed units (tokens) as values. word_a : str - First word. + First word, name of first node. word_b : str - Second word. + Second word, name of second node. """ if word_a in tokens and word_b in tokens: @@ -159,17 +160,17 @@ def _set_graph_edge(graph, tokens, word_a, word_b): def _process_first_window(graph, tokens, split_text): - """Sets an edges between nodes taken from first `WINDOW_SIZE` words - of `split_text` if they exist in `tokens` and `graph`. Works inplace. + """Sets an edges between nodes taken from first :const:`~gensim.parsing.keywords.WINDOW_SIZE` + words of `split_text` if they exist in `tokens` and `graph`, inplace. Parameters ---------- - graph : Graph + graph : :class:~gensim.summarization.graph.Graph Given graph. - tokens : Graph - Given tokens. + tokens : dict + Original units (words) as keys and processed units (tokens) as values. split_text : list of str - First word. + Splited text. """ first_window = _get_first_window(split_text) @@ -183,7 +184,7 @@ def _init_queue(split_text): Parameters ---------- split_text : list of str - Splitted text. + Splited text. Returns ------- @@ -204,10 +205,10 @@ def _process_word(graph, tokens, queue, word): Parameters ---------- - graph : Graph + graph : :class:~gensim.summarization.graph.Graph Given graph. - tokens : Graph - Given tokens. + tokens : dict + Original units (words) as keys and processed units (tokens) as values. queue : Queue Given queue. word : str @@ -236,16 +237,16 @@ def _update_queue(queue, word): def _process_text(graph, tokens, split_text): """Processes `split_text` by updating given `graph` with new eges between nodes if they exists in `tokens` and `graph`. Words are taken from - `split_text` with window size `WINDOW_SIZE`. + `split_text` with window size :const:`~gensim.parsing.keywords.WINDOW_SIZE`. Parameters ---------- - graph : Graph + graph : :class:~gensim.summarization.graph.Graph Given graph. - tokens : Graph - Given tokens. + tokens : dict + Original units (words) as keys and processed units (tokens) as values. split_text : list of str - Splitted text. + Splited text. """ queue = _init_queue(split_text) for i in xrange(WINDOW_SIZE, len(split_text)): @@ -278,16 +279,16 @@ def _queue_iterator(queue): def _set_graph_edges(graph, tokens, split_text): """Updates given `graph` by setting eges between nodes if they exists in `tokens` and `graph`. Words are taken from `split_text` with window size - `WINDOW_SIZE`. + :const:`~gensim.parsing.keywords.WINDOW_SIZE`. Parameters ---------- - graph : Graph + graph : :class:~gensim.summarization.graph.Graph Given graph. tokens : dict - Given tokens. + Original units (words) as keys and processed units (tokens) as values. split_text : list of str - Splitted text. + Splited text. """ _process_first_window(graph, tokens, split_text) _process_text(graph, tokens, split_text) @@ -304,7 +305,7 @@ def _extract_tokens(lemmas, scores, ratio, words): scores : dict Dictionary with lemmas and its scores. 
ratio : float - Proportion of `lemmas` used for final result. + Proportion of lemmas used for final result. words : int Number of used words. If no "words" option is selected, the number of sentences is reduced by the provided ratio, else, the ratio is ignored. @@ -321,17 +322,17 @@ def _extract_tokens(lemmas, scores, ratio, words): def _lemmas_to_words(tokens): - """Extracts words and lemmas from given tokens. + """Returns words and lemmas from given tokens. Produces "reversed" `tokens`. Parameters ---------- tokens : dict - Given tokens. + Original units (words) as keys and processed units (tokens) as values. Returns ------- dict - Keys are lemmas and values are corresponding words. + Lemmas as keys and lists corresponding words as values. """ lemma_to_word = {} @@ -358,7 +359,7 @@ def _get_keywords_with_score(extracted_lemmas, lemma_to_word): Returns ------- dict - Keywords as keys and scores as values. + Keywords as keys and its scores as values. """ @@ -393,10 +394,10 @@ def _get_combined_keywords(_keywords, split_text): Parameters ---------- - _keywords : dict {keywords:scores} - Keywords and its scores. + _keywords : dict + Keywords as keys and its scores as values. split_text : list of str - Splitted text. + Splited text. Returns ------- @@ -432,13 +433,14 @@ def _get_average_score(concept, _keywords): ---------- concept : str Input text. - _keywords : dict {keywords:scores} - Keywords and its scores. + _keywords : dict + Keywords as keys and its scores as values. Returns ------- float Average score. + """ word_list = concept.split() word_counter = 0 @@ -454,8 +456,8 @@ def _format_results(_keywords, combined_keywords, split, scores): Parameters ---------- - _keywords : dict {keywords:scores} - Keywords and its scores. + _keywords : dict + Keywords as keys and its scores as values. combined_keywords : list of str Most ranked words and/or its combinations. split : bool @@ -480,35 +482,33 @@ def _format_results(_keywords, combined_keywords, split, scores): def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), lemmatize=False, deacc=True): - """Returns most ranked word of provided text and/or its combinations . + """Returns most ranked words of provided text and/or its combinations . Parameters ---------- text : str Sequence of values. - ratio : float + ratio : float, optional If no "words" option is selected, the number of sentences is reduced by the provided ratio, else, the ratio is ignored. - words : int + words : int, optional Number of returned words. - split : bool - Whether split keywords, optional. - scores : bool - Whether score of keyword, optional. - pos_filter : tuple + split : bool, optional + Whether split keywords. + scores : bool, optional + Whether score of keyword. + pos_filter : tuple, optional Part of speech filters. - lemmatize : bool - Lemmatize words, optional. - deacc : bool - Remove accentuation, optional. + lemmatize : bool, optional + Lemmatize words. + deacc : bool, optional + Remove accentuation. Returns ------- str or list of str or list of (tuple of str) """ - - # Gets a dict of word -> lemma text = to_unicode(text) tokens = _clean_text_by_word(text, deacc=deacc) @@ -543,8 +543,8 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= def get_graph(text): - """Creates and returns graph from given text. Cleans, tokenizes text - before creating a graph. + """Creates and returns graph from given text. Cleans and tokenizes text + before building graph. 
Parameters ---------- @@ -553,7 +553,7 @@ def get_graph(text): Returns ------- - Graph + :class:~gensim.summarization.graph.Graph Created graph. """ diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py index 7ae7b6bc30..bc6a20eafa 100644 --- a/gensim/summarization/textcleaner.py +++ b/gensim/summarization/textcleaner.py @@ -232,7 +232,8 @@ def clean_text_by_sentences(text): def clean_text_by_word(text, deacc=True): """Tokenizes a given text into words, applying filters and lemmatizing them. - Returns a dictionary of word -> syntacticUnit. + Returns a dictionary of word -> syntacticUnit. Note that different words may + lead to same processed unit. Parameters ---------- @@ -245,6 +246,15 @@ def clean_text_by_word(text, deacc=True): ------- dictionary Word as key, SyntacticUnit as value of dictionary. + + Example + ------- + >>> from gensim.summarization.textcleaner import clean_text_by_word + >>> clean_text_by_word("God helps those who help themselves") + {'god': Original unit: 'god' *-*-*-* Processed unit: 'god', + 'help': Original unit: 'help' *-*-*-* Processed unit: 'help', + 'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'} + """ text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc)) @@ -271,6 +281,7 @@ def tokenize_by_word(text): ------- generator Words contained in processed text. + """ text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) return tokenize(text_without_acronyms, to_lower=True, deacc=True) From 0ca8332935654c215054d457d5e1fa58b0c14ed4 Mon Sep 17 00:00:00 2001 From: Yuri Isakov Date: Tue, 28 Nov 2017 17:32:17 +0300 Subject: [PATCH 12/27] fixed types, examples and types in docstrings --- gensim/summarization/bm25.py | 8 +- gensim/summarization/commons.py | 6 +- gensim/summarization/keywords.py | 22 ++--- gensim/summarization/pagerank_weighted.py | 56 ++++++----- gensim/summarization/summarizer.py | 99 ++++++++++--------- gensim/summarization/syntactic_unit.py | 2 +- gensim/summarization/textcleaner.py | 115 ++++++++++++++-------- 7 files changed, 172 insertions(+), 136 deletions(-) diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index 92be95930e..6c6f7f31e5 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -18,10 +18,10 @@ >>> import numpy as np >>> from gensim.summarization.bm25 import get_bm25_weights >>> corpus = [ ->>> ["black", "cat", "white", "cat"], ->>> ["cat", "outer", "space"], ->>> ["wag", "dog"] ->>> ] +... ["black", "cat", "white", "cat"], +... ["cat", "outer", "space"], +... ["wag", "dog"] +... ] >>> np.round(get_bm25_weights(corpus), 3) array([[ 1.282, 0.182, 0. ], [ 0.13 , 1.113, 0. ], diff --git a/gensim/summarization/commons.py b/gensim/summarization/commons.py index c7cd104e90..b202c4f4e2 100644 --- a/gensim/summarization/commons.py +++ b/gensim/summarization/commons.py @@ -30,11 +30,11 @@ def build_graph(sequence): - """Creates and returns graph with given sequence of values. + """Creates and returns undirected graph with given sequence of values. Parameters ---------- - sequence : list + sequence : list of hashable Sequence of values. Returns @@ -51,7 +51,7 @@ def build_graph(sequence): def remove_unreachable_nodes(graph): - """Removes unreachable nodes (nodes with no edges). Works inplace. + """Removes unreachable nodes (nodes with no edges), inplace. 
Parameters ---------- diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index 559e10b3f0..a44fafe95f 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -11,10 +11,10 @@ -------- >>> from gensim.summarization import keywords >>> text='''Challenges in natural language processing frequently involve ->>> speech recognition, natural language understanding, natural language ->>> generation (frequently from formal, machine-readable logical forms), ->>> connecting language and machine perception, dialog systems, or some ->>> combination thereof.''' +... speech recognition, natural language understanding, natural language +... generation (frequently from formal, machine-readable logical forms), +... connecting language and machine perception, dialog systems, or some +... combination thereof.''' >>> print(gensim.summarization.keywords(text)) natural language machine @@ -23,9 +23,9 @@ >>> from gensim.summarization.keywords import get_graph >>> text = '''Fly me to the moon ->>> Let me play among the stars ->>> Let me see what spring is like ->>> On a, Jupiter and Mars''' +... Let me play among the stars +... Let me see what spring is like +... On a, Jupiter and Mars''' >>> g = get_graph(text) >>> print(g.nodes()) ['fly', 'moon', 'let', 'plai', 'star', 'spring', 'like', 'jupit', 'mar'] @@ -461,7 +461,7 @@ def _format_results(_keywords, combined_keywords, split, scores): combined_keywords : list of str Most ranked words and/or its combinations. split : bool - Whether split result or return string, optional. + Split result if True or return string otherwise, optional. scores : bool Whether return `combined_keywords` with scores, optional. If True `split` is ignored. @@ -494,15 +494,15 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= words : int, optional Number of returned words. split : bool, optional - Whether split keywords. + Whether split keywords if True. scores : bool, optional Whether score of keyword. pos_filter : tuple, optional Part of speech filters. lemmatize : bool, optional - Lemmatize words. + Lemmatize words if True. deacc : bool, optional - Remove accentuation. + Remove accentuation if True. Returns ------- diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py index 226a8927ba..14f44087e0 100644 --- a/gensim/summarization/pagerank_weighted.py +++ b/gensim/summarization/pagerank_weighted.py @@ -6,22 +6,26 @@ """ -Example -------- +Examples +-------- + >>> from gensim.summarization.keywords import get_graph >>> from gensim.summarization.pagerank_weighted import pagerank_weighted ->>> text = "In graph theory and computer science, an adjacency matrix \ ->>> is a square matrix used to represent a finite graph." 
->>> graph = get_graph(text) +>>> graph = get_graph("The road to hell is paved with good intentions.") >>> pagerank_weighted(graph) -{'adjac': array([ 0.29628575]), - 'finit': array([ 0.29628575]), - 'graph': array([ 0.56766066]), - 'matrix': array([ 0.56766066]), - 'repres': array([ 0.04680678]), - 'scienc': array([ 0.04680678]), - 'squar': array([ 0.29628575]), - 'theori': array([ 0.29628575])} +{'good': 0.70432858653171504, + 'hell': 0.051128871128006126, + 'intent': 0.70432858653171504, + 'pave': 0.051128871128006015, + 'road': 0.051128871128006237} + +>>> from gensim.summarization.pagerank_weighted import build_adjacency_matrix +>>> build_adjacency_matrix(graph).todense() +matrix([[ 0., 0., 0., 0., 0.], + [ 0., 0., 1., 0., 0.], + [ 0., 1., 0., 0., 0.], + [ 0., 0., 0., 0., 0.], + [ 0., 0., 0., 0., 0.]]) """ @@ -46,7 +50,7 @@ def pagerank_weighted(graph, damping=0.85): Parameters ---------- - graph : Graph + graph : :class:~gensim.summarization.graph.Graph Given graph. damping : float Damping parameter, optional @@ -54,7 +58,7 @@ def pagerank_weighted(graph, damping=0.85): Returns ------- dict - Keys are `graph` nodes, values are its ranks. + Nodes of `graph` as keys, its ranks as values. """ adjacency_matrix = build_adjacency_matrix(graph) @@ -73,12 +77,12 @@ def build_adjacency_matrix(graph): Parameters ---------- - graph : Graph + graph : :class:~gensim.summarization.graph.Graph Given graph. Returns ------- - csr_matrix (n, n) + :class:scipy.sparse.csr_matrix, shape = [n, n], n is number of nodes Adjacency matrix of given `graph`. """ @@ -107,12 +111,12 @@ def build_probability_matrix(graph): Parameters ---------- - graph : Graph + graph : :class:~gensim.summarization.graph.Graph Given graph. Returns ------- - array (n, ) + array, shape = [n, n], n is number of nodes of `graph` Eigenvector of matrix `a`. """ @@ -130,17 +134,15 @@ def principal_eigenvector(a): Parameters ---------- - a : array (n, n) + a : array, shape = [n, n] Given matrix. Returns ------- - array (n, ) + array, shape = [n, ] Eigenvector of matrix `a`. """ - - # Note that we prefer to use `eigs` even for dense matrix # because we need only one eigenvector. See #441, #438 for discussion. @@ -156,19 +158,19 @@ def principal_eigenvector(a): def process_results(graph, vec): """Returns `graph` nodes and corresponding absolute values of provided - eigenvector. + eigenvector. This function os helper for :func:`~gensim.summarization.pagerank_weighted.pagerank_weighted` Parameters ---------- - graph : Graph + graph : :class:~gensim.summarization.graph.Graph Given graph. - vec : array + vec : array, shape = [n, ], n is number of nodes of `graph` Given eigenvector. Returns ------- dict - Keys are graph nodes, values are elements of eigenvector. + Graph nodes as keys, corresponding elements of eigenvector as values. """ scores = {} diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index e0d2ae0a82..647e36b600 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -4,8 +4,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """This module provides functions for summarizing texts. Summarizing is based on -ranks of text sentences using BM25 algorithm. - +ranks of text sentences using a variation of the TextRank algorithm (see [1]_ ). Data: @@ -18,36 +17,38 @@ ------- >>> from gensim.summarization.summarizer import summarize ->>> text = ''' ->>> Rice Pudding - Poem by Alan Alexander Milne ->>> ->>> What is the matter with Mary Jane? 
->>> She's crying with all her might and main, ->>> And she won't eat her dinner - rice pudding again - ->>> What is the matter with Mary Jane? ->>> What is the matter with Mary Jane? ->>> I've promised her dolls and a daisy-chain, ->>> And a book about animals - all in vain - ->>> What is the matter with Mary Jane? ->>> What is the matter with Mary Jane? ->>> She's perfectly well, and she hasn't a pain; ->>> But, look at her, now she's beginning again! - ->>> What is the matter with Mary Jane? ->>> What is the matter with Mary Jane? ->>> I've promised her sweets and a ride in the train, ->>> And I've begged her to stop for a bit and explain - ->>> What is the matter with Mary Jane? ->>> What is the matter with Mary Jane? ->>> She's perfectly well and she hasn't a pain, ->>> And it's lovely rice pudding for dinner again! ->>> What is the matter with Mary Jane? ->>> ''' +>>> text = '''Rice Pudding - Poem by Alan Alexander Milne +... What is the matter with Mary Jane? +... She's crying with all her might and main, +... And she won't eat her dinner - rice pudding again - +... What is the matter with Mary Jane? +... What is the matter with Mary Jane? +... I've promised her dolls and a daisy-chain, +... And a book about animals - all in vain - +... What is the matter with Mary Jane? +... What is the matter with Mary Jane? +... She's perfectly well, and she hasn't a pain; +... But, look at her, now she's beginning again! - +... What is the matter with Mary Jane? +... What is the matter with Mary Jane? +... I've promised her sweets and a ride in the train, +... And I've begged her to stop for a bit and explain - +... What is the matter with Mary Jane? +... What is the matter with Mary Jane? +... She's perfectly well and she hasn't a pain, +... And it's lovely rice pudding for dinner again! +... What is the matter with Mary Jane?''' >>> print(summarize(text)) And she won't eat her dinner - rice pudding again - I've promised her dolls and a daisy-chain, I've promised her sweets and a ride in the train, And it's lovely rice pudding for dinner again! + +.. [1] Federico Barrios, Federico L´opez, Luis Argerich, Rosita Wachenchauzer (2016). +Variations of the Similarity Function of TextRank for Automated Summarization, +https://arxiv.org/abs/1602.03606 + """ import logging @@ -70,11 +71,11 @@ def _set_graph_edge_weights(graph): """Sets weights using BM25 algorithm. Leaves small weights as zeroes. If all - weights are fairly small forces all weights to 1. Works inplace. + weights are fairly small forces all weights to 1, inplace. Parameters ---------- - graph : Graph + graph : :class:~gensim.summarization.graph.Graph Given graph. """ @@ -104,11 +105,11 @@ def _set_graph_edge_weights(graph): def _create_valid_graph(graph): - """Sets all weights of edges for different edges as 1. Works inplace. + """Sets all weights of edges for different edges as 1, inplace. Parameters ---------- - graph : Graph + graph : :class:~gensim.summarization.graph.Graph Given graph. """ @@ -178,7 +179,7 @@ def _build_corpus(sentences): Parameters ---------- - sentences : list of SyntacticUnit + sentences : list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit Given senteces. Returns @@ -197,16 +198,16 @@ def _get_important_sentences(sentences, corpus, important_docs): Parameters ---------- - sentences : list of SyntacticUnit + sentences : list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit Given senteces. corpus : list of (list of (tuple of int)) Provided corpus. 
important_docs : list of (list of (tuple of int)) - Most important docs of the corpus. + Most important documents of the corpus. Returns ------- - list of SyntacticUnit + list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit Most important sentences. """ @@ -221,7 +222,7 @@ def _get_sentences_with_word_count(sentences, word_count): Parameters ---------- - sentences : list of SyntacticUnit + sentences : list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit Given senteces. word_count : int or None Number of returned words. If None full most important sentences will be @@ -229,7 +230,7 @@ def _get_sentences_with_word_count(sentences, word_count): Returns ------- - list of SyntacticUnit + list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit Most important sentences. """ @@ -256,19 +257,19 @@ def _extract_important_sentences(sentences, corpus, important_docs, word_count): Parameters ---------- - sentences : list of SyntacticUnit + sentences : list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit Given senteces. corpus : list of (list of (tuple of int)) Provided corpus. important_docs : list of (list of (tuple of int)) Most important docs of the corpus. - word_count : int or None + word_count : int Number of returned words. If None full most important sentences will be returned. Returns ------- - list SyntacticUnit + list :class:~gensim.summarization.syntactic_unit.SyntacticUnit Most important sentences. """ @@ -284,7 +285,7 @@ def _format_results(extracted_sentences, split): Parameters ---------- - extracted_sentences : list of SyntacticUnit + extracted_sentences : list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit Given senteces. split : bool If True senteces will be returned as list. Otherwise senteces will be @@ -320,9 +321,11 @@ def _build_hasheable_corpus(corpus): def summarize_corpus(corpus, ratio=0.2): """Returns a list of the most important documents of a corpus using a - variation of the TextRank algorithm. The input must have at least - `INPUT_MIN_LENGTH` documents for the summary to make sense. - + variation of the TextRank algorithm. Used as helper for summarize + :func:`~gensim.summarization.summarizer.summarizer` + + The input must have at least + :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary to make sense. The length of the output can be specified using the ratio parameter, which determines how many documents will be chosen for the summary (defaults at 20% of the number of documents of the corpus). @@ -333,7 +336,7 @@ def summarize_corpus(corpus, ratio=0.2): Given corpus. ratio : float Number between 0 and 1 that determines the proportion of the number of - sentences of the original text to be chosen for the summary. Optional. + sentences of the original text to be chosen for the summary, optional. Returns ------- @@ -371,19 +374,19 @@ def summarize_corpus(corpus, ratio=0.2): def summarize(text, ratio=0.2, word_count=None, split=False): - """Returns a summarized version of the given text using a variation of - the TextRank algorithm (see https://arxiv.org/abs/1602.03606). + """Returns a summarized version of the given text. The output summary will consist of the most representative sentences and will be returned as a string, divided by newlines. The input should be a string, and must be longer than - `INPUT_MIN_LENGTH` sentences for the summary to make sense. The text + :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` sentences for + the summary to make sense. 
The text
     will be split into sentences using the split_sentences method in the
     summarization.textcleaner module. Note that newlines divide sentences.

     The length of the output can be specified using the ratio and
-    word_count parameters.
+    `word_count` parameters.

     Parameters
     ----------
diff --git a/gensim/summarization/syntactic_unit.py b/gensim/summarization/syntactic_unit.py
index 492f141ad6..9d02d422ea 100644
--- a/gensim/summarization/syntactic_unit.py
+++ b/gensim/summarization/syntactic_unit.py
@@ -29,7 +29,7 @@ class SyntacticUnit(object):
     index : int
         Index of syntactic unit in corpus, optional.
     score : float
-        Score (BM25) of syntactic unit, optional.
+        Score of syntactic unit, optional.

     """

diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py
index bc6a20eafa..c470f9c912 100644
--- a/gensim/summarization/textcleaner.py
+++ b/gensim/summarization/textcleaner.py
@@ -3,9 +3,7 @@
 #
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

-"""Text Cleaner
-
-This module contains functions and processors used for processing text,
+"""This module contains functions and processors used for processing text,
 extracting sentences from text, working with acronyms and abbreviations.
 """

 def split_sentences(text):
     """Splits and returns list of sentences from given text. It preserves
-    abbreviations set in `AB_SENIOR` and `AB_ACRONYM`.
+    abbreviations set in
+    :const:`~gensim.summarization.textcleaner.AB_SENIOR` and
+    :const:`~gensim.summarization.textcleaner.AB_ACRONYM`.

     Parameters
     ----------
     text : str
         Input text.

     Returns
     -------
-    str:
-        List of sentences from text.
+    list of str
+        Sentences of given text.
+
+    Example
+    -------
+    >>> from gensim.summarization.textcleaner import split_sentences
+    >>> text = '''Beautiful is better than ugly.
+    ... Explicit is better than implicit. Simple is better than complex.'''
+    >>> split_sentences(text)
+    ['Beautiful is better than ugly.',
+    'Explicit is better than implicit.',
+    'Simple is better than complex.']
+
     """
     processed = replace_abbreviations(text)
     return [undo_replacement(sentence) for sentence in get_sentences(processed)]


 def replace_abbreviations(text):
-    """Replaces blank space to @ separator after abbreviation and next word.
+    """Replaces blank space with '@' separator between abbreviation and next word.

     Parameters
     ----------
     text : str
         Input text.

     Returns
     -------
-    str:
+    str
         Sentence with changed separator.

     Example
     -------
     >>> replace_abbreviations("God bless you, please, Mrs. Robinson")
     God bless you, please, Mrs.@Robinson
+
     """
     return replace_with_separator(text, SEPARATOR, [AB_SENIOR, AB_ACRONYM])


 def undo_replacement(sentence):
     """Replaces `@` separator back to blank space after each abbreviation.

     Parameters
     ----------
     sentence : str
         Input sentence.

     Returns
     -------
-    str:
+    str
         Sentence with changed separator.

     Example
     -------
     >>> undo_replacement("God bless you, please, Mrs.@Robinson")
     God bless you, please, Mrs. Robinson
+
     """
     return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM])


 def replace_with_separator(text, separator, regexs):
     """Returns text with replaced separator if provided regular expressions
-    were matched.
+    were matched. Used as a helper in other replacement functions.

     Parameters
     ----------
     text : str
         Input text.
     separator : str
         The separator between words to be replaced.
-    regexs : str
-        List of regular expressions.
+    regexs : list of _sre.SRE_Pattern
+        Regular expressions used in processing text.
Returns ------- str Text with replaced separators. + """ replacement = r"\1" + separator + r"\2" result = text @@ -133,7 +147,8 @@ def replace_with_separator(text, separator, regexs): def get_sentences(text): - """Sentence generator from provided text. Sentence pattern set in `RE_SENTENCE`. + """Sentence generator from provided text. Sentence pattern set in + :const:`~gensim.summarization.textcleaner.RE_SENTENCE`. Parameters ---------- @@ -147,11 +162,12 @@ def get_sentences(text): Example ------- - >>> text = "Does this text contains two sentences? Yes, it is." + >>> text = "Does this text contains two sentences? Yes, it does." >>> for sentence in get_sentences(text): >>> print(sentence) Does this text contains two sentences? - Yes, it is. + Yes, it does. + """ for match in RE_SENTENCE.finditer(text): yield match.group() @@ -160,7 +176,7 @@ def get_sentences(text): def merge_syntactic_units(original_units, filtered_units, tags=None): """Processes given sentences and its filtered (tokenized) copies into SyntacticUnit type. Also adds tags if they are provided to produced units. - Returns a SyntacticUnit list. + Returns list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit. Parameters ---------- @@ -168,13 +184,14 @@ def merge_syntactic_units(original_units, filtered_units, tags=None): List of original sentences. filtered_units : list List of tokenized sentences. - tags : list + tags : list of str, optional List of strings used as tags for each unit. None as deafault. Returns ------- - list - SyntacticUnit for each input item. + list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit + List of syntactic units (sentences). + """ units = [] for i in xrange(len(original_units)): @@ -193,36 +210,38 @@ def merge_syntactic_units(original_units, filtered_units, tags=None): def join_words(words, separator=" "): - """Merges words to a string using separator (blank space as default). + """Concatenates `words` with `separator` between elements. Parameters ---------- - words : list - List of words. - separator : str - The separator bertween elements. Blank set as default. + words : list of str + Given words. + separator : str, optional + The separator between elements. Blank space set as default. Returns ------- str - String of merged words with separator between them. + String of merged words with separator between elements. + """ return separator.join(words) def clean_text_by_sentences(text): """Tokenizes a given text into sentences, applying filters and lemmatizing them. - Returns a SyntacticUnit list. + Returns a list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit. Parameters ---------- - text : list - Input text. + text : str + Given text. Returns ------- - list - SyntacticUnit objects for each sentence. + list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit + Sentences of the given text. + """ original_sentences = split_sentences(text) filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)] @@ -232,20 +251,20 @@ def clean_text_by_sentences(text): def clean_text_by_word(text, deacc=True): """Tokenizes a given text into words, applying filters and lemmatizing them. - Returns a dictionary of word -> syntacticUnit. Note that different words may - lead to same processed unit. + Returns a dictionary with words as keys and :class:~gensim.summarization.syntactic_unit.SyntacticUnit + as values. Note that different words may lead to same processed units. Parameters ---------- - text : list - Input text. 
-    deacc : bool
-        Remove accentuation (default True).
+    text : str
+        Given text.
+    deacc : bool, optional
+        Remove accentuation if True.

     Returns
     -------
-    dictionary
-        Word as key, SyntacticUnit as value of dictionary.
+    dict
+        Words as keys, :class:~gensim.summarization.syntactic_unit.SyntacticUnit as values.

     Example
     -------
     >>> from gensim.summarization.textcleaner import clean_text_by_word
     >>> clean_text_by_word("God helps those who help themselves")
     {'god': Original unit: 'god' *-*-*-* Processed unit: 'god',
     'help': Original unit: 'help' *-*-*-* Processed unit: 'help',
     'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'}

     """
     text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
     original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
     filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]


 def tokenize_by_word(text):
     """Tokenizes input text. Before tokenizing transforms text to lower case and
-    removes accentuation and acronyms set `AB_ACRONYM_LETTERS`.
+    removes accentuation and acronyms set
+    :const:`~gensim.summarization.textcleaner.AB_ACRONYM_LETTERS`.
     Returns generator of words.

     Parameters
     ----------
-    text : list
-        Input text.
+    text : str
+        Given text.

     Returns
     -------
     generator
-        Words contained in processed text.
+        Generator that yields words of the given text.
+
+    Example
+    -------
+    >>> from gensim.summarization.textcleaner import tokenize_by_word
+    >>> g = tokenize_by_word('Veni. Vidi. Vici.')
+    >>> print(next(g))
+    veni
+    >>> print(next(g))
+    vidi
+    >>> print(next(g))
+    vici

     """
     text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
     return tokenize(text_without_acronyms, to_lower=True, deacc=True)

From 20b19d6e1fb3f468135465ffd58fd5b4f43e28e7 Mon Sep 17 00:00:00 2001
From: ivan
Date: Mon, 11 Dec 2017 22:33:53 +0500
Subject: [PATCH 13/27] fix pep8

---
 gensim/summarization/bm25.py              | 21 +++----
 gensim/summarization/graph.py             | 42 ++++++-------
 gensim/summarization/keywords.py          | 50 +++++++--------
 gensim/summarization/pagerank_weighted.py |  6 +-
 gensim/summarization/summarizer.py        | 76 +++++++++++------------
 gensim/summarization/syntactic_unit.py    |  5 +-
 gensim/summarization/textcleaner.py       | 63 +++++++++----------
 7 files changed, 129 insertions(+), 134 deletions(-)

diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py
index 479be78118..bac2d8ed55 100644
--- a/gensim/summarization/bm25.py
+++ b/gensim/summarization/bm25.py
@@ -3,12 +3,13 @@
 #
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

-"""This module contains functions to compute rank scores for documents in
-corpus and helper class `BM25` used in calculations. Original algorithm
+"""This module contains functions to compute rank scores for documents in
+corpus and helper class `BM25` used in calculations. Original algorithm
 described in [1]_; you may also check the Wikipedia page [2]_.


-.. [1] Robertson, Stephen; Zaragoza, Hugo (2009). The Probabilistic Relevance Framework: BM25 and Beyond, http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf
+.. [1] Robertson, Stephen; Zaragoza, Hugo (2009). The Probabilistic Relevance Framework: BM25 and Beyond,
+       http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf
 .. [2] Okapi BM25 on Wikipedia, https://en.wikipedia.org/wiki/Okapi_BM25

@@ -57,18 +58,17 @@ class BM25(object):
     corpus : list of (list of str)
         Corpus of documents.
     f : list of dict
-        Dictionary with terms frequencies for each document in `corpus`. Words
+        Dictionary with terms frequencies for each document in `corpus`. Words
         used as keys and frequencies as values.
     df : dict
         Dictionary with terms frequencies for whole `corpus`. Words used as keys
         and frequencies as values.
     idf : dict
-        Dictionary with inversed terms frequencies for whole `corpus`. Words
+        Dictionary with inversed terms frequencies for whole `corpus`. Words
        used as keys and frequencies as values.

     """

-
     def __init__(self, corpus):
         """Presets attributes and runs initialize() function.
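A short usage sketch of the BM25 class documented above, assuming only the API
shown in these hunks (average_idf is computed the same way get_bm25_weights()
does it):

>>> from gensim.summarization.bm25 import BM25
>>> corpus = [["black", "cat", "white", "cat"], ["cat", "outer", "space"], ["wag", "dog"]]
>>> bm25 = BM25(corpus)
>>> average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
>>> scores = bm25.get_scores(corpus[0], average_idf)  # one score per document in corpus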
@@ -86,7 +86,6 @@ def __init__(self, corpus): self.idf = {} self.initialize() - def initialize(self): """Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies. @@ -108,7 +107,6 @@ def initialize(self): for word, freq in iteritems(self.df): self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5) - def get_score(self, document, index, average_idf): """Computes BM25 score of given `document` in relation to item of corpus selected by `index`. @@ -137,10 +135,9 @@ def get_score(self, document, index, average_idf): / (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * len(document) / self.avgdl))) return score - def get_scores(self, document, average_idf): - """Computes and returns BM25 scores of given `document` in relation to - every item in corpus. + """Computes and returns BM25 scores of given `document` in relation to + every item in corpus. Parameters ---------- @@ -164,7 +161,7 @@ def get_scores(self, document, average_idf): def get_bm25_weights(corpus): """Returns BM25 scores (weights) of documents in corpus. Each document - has to be weighted with every document in given corpus. + has to be weighted with every document in given corpus. Parameters ---------- diff --git a/gensim/summarization/graph.py b/gensim/summarization/graph.py index 44be0519ef..c3bc473e7c 100644 --- a/gensim/summarization/graph.py +++ b/gensim/summarization/graph.py @@ -106,7 +106,7 @@ def has_node(self, node): @abstractmethod def add_node(self, node, attrs=None): """Adds given node to the graph. - + Note ---- While nodes can be of any type, it's strongly recommended @@ -125,7 +125,7 @@ def add_node(self, node, attrs=None): @abstractmethod def add_edge(self, edge, wt=1, label='', attrs=None): - """Adds an edge to the graph connecting two nodes. An edge, here, + """Adds an edge to the graph connecting two nodes. An edge, here, is a tuple of two nodes. Parameters @@ -184,7 +184,7 @@ def del_node(self, node): ---------- node : hashable Node to delete. - + """ pass @@ -293,7 +293,7 @@ def has_node(self, node): return node in self.node_neighbors def add_edge(self, edge, wt=1, label='', attrs=None): - """Adds an edge to the graph connecting two nodes. An edge, here, + """Adds an edge to the graph connecting two nodes. An edge, here, is a tuple of two nodes. Parameters @@ -328,7 +328,7 @@ def add_edge(self, edge, wt=1, label='', attrs=None): def add_node(self, node, attrs=None): """Adds given node to the graph. - + Note ---- While nodes can be of any type, it's strongly recommended @@ -363,7 +363,7 @@ def nodes(self): ------- list of hashable Nodes of graph. - + """ return list(self.node_neighbors.keys()) @@ -374,7 +374,7 @@ def edges(self): ------- list of (tuple of hashable, size = 2) Edges of graph. - + """ return [a for a in self.edge_properties.keys()] @@ -385,7 +385,7 @@ def del_node(self, node): ---------- node : hashable Given node. - + """ for each in list(self.neighbors(node)): if each != node: @@ -394,7 +394,7 @@ def del_node(self, node): del self.node_attr[node] def get_edge_properties(self, edge): - """Returns properties of given given edge. If edge doesn't exist + """Returns properties of given given edge. If edge doesn't exist empty dictionary will be returned. Parameters @@ -406,12 +406,12 @@ def get_edge_properties(self, edge): ------- dict Properties of graph. - + """ return self.edge_properties.setdefault(edge, {}) def add_edge_attributes(self, edge, attrs): - """Adds attributes `attrs`to given edge. 
Order of nodes in edge doesn't + """Adds attributes `attrs`to given edge. Order of nodes in edge doesn't matter. Parameters @@ -420,13 +420,13 @@ def add_edge_attributes(self, edge, attrs): Given edge. attrs : list Provided attributes to add. - + """ for attr in attrs: self.add_edge_attribute(edge, attr) def add_edge_attribute(self, edge, attr): - """Adds attribute `attr`to given edge. Order of nodes in edge doesn't + """Adds attribute `attr`to given edge. Order of nodes in edge doesn't matter. Parameters @@ -436,7 +436,7 @@ def add_edge_attribute(self, edge, attr): attr : object Provided attribute to add. - + """ self.edge_attr[edge] = self.edge_attributes(edge) + [attr] @@ -444,19 +444,19 @@ def add_edge_attribute(self, edge, attr): self.edge_attr[(edge[1], edge[0])] = self.edge_attributes((edge[1], edge[0])) + [attr] def edge_attributes(self, edge): - """Returns attributes of given edge. In case of non existing edge + """Returns attributes of given edge. In case of non existing edge returns empty list. Parameters ---------- edge : tuple of hashable, size = 2 Given edge. - + Returns ------- list Attributes of given edge. - + """ try: return self.edge_attr[edge] @@ -464,7 +464,7 @@ def edge_attributes(self, edge): return [] def set_edge_properties(self, edge, **properties): - """Adds `properties` to given edge. Order of nodes in edge doesn't + """Adds `properties` to given edge. Order of nodes in edge doesn't matter. Parameters @@ -474,7 +474,7 @@ def set_edge_properties(self, edge, **properties): properties : dict Properties to add. - + """ self.edge_properties.setdefault(edge, {}).update(properties) if edge[0] != edge[1]: @@ -487,7 +487,7 @@ def del_edge(self, edge): ---------- edge : tuple of hashable, size = 2 Given edge. - + """ u, v = edge self.node_neighbors[u].remove(v) @@ -503,7 +503,7 @@ def del_edge_labeling(self, edge): ---------- edge : tuple of hashable, size = 2 Given edge. - + """ keys = [edge, edge[::-1]] diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index a44fafe95f..54f08fbf0d 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -67,7 +67,7 @@ def _get_pos_filters(): """Returns default including and excluding filters as frozen sets. - + Returns ------- tuple of frozenset @@ -87,7 +87,7 @@ def _get_words_for_graph(tokens, pos_filter=None): Original units (words) as keys and processed units (tokens) as values. pos_filter : tuple of list Part of speech filters, optional. - + Returns ------- list @@ -117,25 +117,25 @@ def _get_words_for_graph(tokens, pos_filter=None): def _get_first_window(split_text): - """Returns first :const:`~gensim.parsing.keywords.WINDOW_SIZE` tokens from + """Returns first :const:`~gensim.parsing.keywords.WINDOW_SIZE` tokens from given Splited text. Parameters ---------- split_text : list Given Splited text. - + Returns ------- tuple of frozenset Including and excluding filters. - + """ return split_text[:WINDOW_SIZE] def _set_graph_edge(graph, tokens, word_a, word_b): - """Sets an edge between nodes named word_a and word_b if they exists in + """Sets an edge between nodes named word_a and word_b if they exists in `tokens` and `graph`, inplace. Parameters @@ -148,7 +148,7 @@ def _set_graph_edge(graph, tokens, word_a, word_b): First word, name of first node. word_b : str Second word, name of second node. 
- + """ if word_a in tokens and word_b in tokens: lemma_a = tokens[word_a].token @@ -171,7 +171,7 @@ def _process_first_window(graph, tokens, split_text): Original units (words) as keys and processed units (tokens) as values. split_text : list of str Splited text. - + """ first_window = _get_first_window(split_text) for word_a, word_b in _combinations(first_window, 2): @@ -179,7 +179,7 @@ def _process_first_window(graph, tokens, split_text): def _init_queue(split_text): - """Initializies queue by first words from `split_text`. + """Initializies queue by first words from `split_text`. Parameters ---------- @@ -190,7 +190,7 @@ def _init_queue(split_text): ------- Queue Initialized queue. - + """ queue = _Queue() first_window = _get_first_window(split_text) @@ -201,7 +201,7 @@ def _init_queue(split_text): def _process_word(graph, tokens, queue, word): """Sets edge between `word` and each element in queue in `graph` if such nodes - exist in `tokens` and `graph`. + exist in `tokens` and `graph`. Parameters ---------- @@ -235,8 +235,8 @@ def _update_queue(queue, word): def _process_text(graph, tokens, split_text): - """Processes `split_text` by updating given `graph` with new eges between - nodes if they exists in `tokens` and `graph`. Words are taken from + """Processes `split_text` by updating given `graph` with new eges between + nodes if they exists in `tokens` and `graph`. Words are taken from `split_text` with window size :const:`~gensim.parsing.keywords.WINDOW_SIZE`. Parameters @@ -267,7 +267,7 @@ def _queue_iterator(queue): ------ str Current item of queue. - + """ iterations = queue.qsize() for _ in xrange(iterations): @@ -277,7 +277,7 @@ def _queue_iterator(queue): def _set_graph_edges(graph, tokens, split_text): - """Updates given `graph` by setting eges between nodes if they exists in + """Updates given `graph` by setting eges between nodes if they exists in `tokens` and `graph`. Words are taken from `split_text` with window size :const:`~gensim.parsing.keywords.WINDOW_SIZE`. @@ -295,7 +295,7 @@ def _set_graph_edges(graph, tokens, split_text): def _extract_tokens(lemmas, scores, ratio, words): - """Extracts tokens from provided lemmas. Most scored lemmas are used if + """Extracts tokens from provided lemmas. Most scored lemmas are used if `words` not provided. Parameters @@ -305,9 +305,9 @@ def _extract_tokens(lemmas, scores, ratio, words): scores : dict Dictionary with lemmas and its scores. ratio : float - Proportion of lemmas used for final result. + Proportion of lemmas used for final result. words : int - Number of used words. If no "words" option is selected, the number of + Number of used words. If no "words" option is selected, the number of sentences is reduced by the provided ratio, else, the ratio is ignored. Returns @@ -333,7 +333,7 @@ def _lemmas_to_words(tokens): ------- dict Lemmas as keys and lists corresponding words as values. - + """ lemma_to_word = {} for word, unit in iteritems(tokens): @@ -360,7 +360,7 @@ def _get_keywords_with_score(extracted_lemmas, lemma_to_word): ------- dict Keywords as keys and its scores as values. - + """ keywords = {} @@ -378,7 +378,7 @@ def _strip_word(word): ---------- word : str Given word. - + Returns ------- str @@ -398,7 +398,7 @@ def _get_combined_keywords(_keywords, split_text): Keywords as keys and its scores as values. split_text : list of str Splited text. - + Returns ------- list of str @@ -435,7 +435,7 @@ def _get_average_score(concept, _keywords): Input text. _keywords : dict Keywords as keys and its scores as values. 
- + Returns ------- float @@ -463,7 +463,7 @@ def _format_results(_keywords, combined_keywords, split, scores): split : bool Split result if True or return string otherwise, optional. scores : bool - Whether return `combined_keywords` with scores, optional. If True + Whether return `combined_keywords` with scores, optional. If True `split` is ignored. Returns @@ -543,7 +543,7 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= def get_graph(text): - """Creates and returns graph from given text. Cleans and tokenizes text + """Creates and returns graph from given text. Cleans and tokenizes text before building graph. Parameters diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py index 14f44087e0..68410fbc66 100644 --- a/gensim/summarization/pagerank_weighted.py +++ b/gensim/summarization/pagerank_weighted.py @@ -9,7 +9,7 @@ Examples -------- ->>> from gensim.summarization.keywords import get_graph +>>> from gensim.summarization.keywords import get_graph >>> from gensim.summarization.pagerank_weighted import pagerank_weighted >>> graph = get_graph("The road to hell is paved with good intentions.") >>> pagerank_weighted(graph) @@ -73,7 +73,7 @@ def pagerank_weighted(graph, damping=0.85): def build_adjacency_matrix(graph): - """Returns matrix representation of given `graph`. + """Returns matrix representation of given `graph`. Parameters ---------- @@ -157,7 +157,7 @@ def principal_eigenvector(a): def process_results(graph, vec): - """Returns `graph` nodes and corresponding absolute values of provided + """Returns `graph` nodes and corresponding absolute values of provided eigenvector. This function os helper for :func:`~gensim.summarization.pagerank_weighted.pagerank_weighted` Parameters diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 4c65a14531..f2c6ff50eb 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -18,34 +18,34 @@ >>> from gensim.summarization.summarizer import summarize >>> text = '''Rice Pudding - Poem by Alan Alexander Milne -... What is the matter with Mary Jane? -... She's crying with all her might and main, -... And she won't eat her dinner - rice pudding again - -... What is the matter with Mary Jane? -... What is the matter with Mary Jane? -... I've promised her dolls and a daisy-chain, -... And a book about animals - all in vain - -... What is the matter with Mary Jane? -... What is the matter with Mary Jane? -... She's perfectly well, and she hasn't a pain; -... But, look at her, now she's beginning again! - -... What is the matter with Mary Jane? -... What is the matter with Mary Jane? -... I've promised her sweets and a ride in the train, -... And I've begged her to stop for a bit and explain - -... What is the matter with Mary Jane? -... What is the matter with Mary Jane? -... She's perfectly well and she hasn't a pain, -... And it's lovely rice pudding for dinner again! +... What is the matter with Mary Jane? +... She's crying with all her might and main, +... And she won't eat her dinner - rice pudding again - +... What is the matter with Mary Jane? +... What is the matter with Mary Jane? +... I've promised her dolls and a daisy-chain, +... And a book about animals - all in vain - +... What is the matter with Mary Jane? +... What is the matter with Mary Jane? +... She's perfectly well, and she hasn't a pain; +... But, look at her, now she's beginning again! - +... What is the matter with Mary Jane? +... 
What is the matter with Mary Jane? +... I've promised her sweets and a ride in the train, +... And I've begged her to stop for a bit and explain - +... What is the matter with Mary Jane? +... What is the matter with Mary Jane? +... She's perfectly well and she hasn't a pain, +... And it's lovely rice pudding for dinner again! ... What is the matter with Mary Jane?''' >>> print(summarize(text)) -And she won't eat her dinner - rice pudding again - -I've promised her dolls and a daisy-chain, -I've promised her sweets and a ride in the train, +And she won't eat her dinner - rice pudding again - +I've promised her dolls and a daisy-chain, +I've promised her sweets and a ride in the train, And it's lovely rice pudding for dinner again! -.. [1] Federico Barrios, Federico L´opez, Luis Argerich, Rosita Wachenchauzer (2016). +.. [1] Federico Barrios, Federico L´opez, Luis Argerich, Rosita Wachenchauzer (2016). Variations of the Similarity Function of TextRank for Automated Summarization, https://arxiv.org/abs/1602.03606 @@ -77,7 +77,7 @@ def _set_graph_edge_weights(graph): ---------- graph : :class:~gensim.summarization.graph.Graph Given graph. - + """ documents = graph.nodes() weights = _bm25_weights(documents) @@ -111,7 +111,7 @@ def _create_valid_graph(graph): ---------- graph : :class:~gensim.summarization.graph.Graph Given graph. - + """ nodes = graph.nodes() @@ -217,7 +217,7 @@ def _get_important_sentences(sentences, corpus, important_docs): def _get_sentences_with_word_count(sentences, word_count): - """Returns list of sentences. Total number of returned words close to + """Returns list of sentences. Total number of returned words close to specified `word_count`. Parameters @@ -253,7 +253,7 @@ def _get_sentences_with_word_count(sentences, word_count): def _extract_important_sentences(sentences, corpus, important_docs, word_count): - """Returns most important sentences of the `corpus`. + """Returns most important sentences of the `corpus`. Parameters ---------- @@ -290,7 +290,7 @@ def _format_results(extracted_sentences, split): extracted_sentences : list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit Given senteces. split : bool - If True senteces will be returned as list. Otherwise senteces will be + If True senteces will be returned as list. Otherwise senteces will be merged and returned as string. Returns @@ -323,10 +323,10 @@ def _build_hasheable_corpus(corpus): def summarize_corpus(corpus, ratio=0.2): """Returns a list of the most important documents of a corpus using a - variation of the TextRank algorithm. Used as helper for summarize + variation of the TextRank algorithm. Used as helper for summarize :func:`~gensim.summarization.summarizer.summarizer` - - The input must have at least + + The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary to make sense. The length of the output can be specified using the ratio parameter, which determines how many documents will be chosen for the summary @@ -336,8 +336,8 @@ def summarize_corpus(corpus, ratio=0.2): ---------- corpus : list of (list of (tuple of int)) Given corpus. - ratio : float - Number between 0 and 1 that determines the proportion of the number of + ratio : float + Number between 0 and 1 that determines the proportion of the number of sentences of the original text to be chosen for the summary, optional. Returns @@ -382,7 +382,7 @@ def summarize(text, ratio=0.2, word_count=None, split=False): and will be returned as a string, divided by newlines. 
The input should be a string, and must be longer than
-    :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` sentences for
+    :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` sentences for
     the summary to make sense. The text will be split into sentences using
     the split_sentences method in the summarization.textcleaner module. Note
     that newlines divide sentences.
 
@@ -394,14 +394,14 @@
     ----------
     text : str
         Given text.
-    ratio : float
-        Number between 0 and 1 that determines the proportion of the number of
+    ratio : float
+        Number between 0 and 1 that determines the proportion of the number of
         sentences of the original text to be chosen for the summary. Optional.
-    word_count : int
+    word_count : int
         Determines how many words the output will contain.
         If both parameters are provided, the ratio will be ignored.
     split : bool
-        If True, list of sentences will be returned. Otherwise joined
+        If True, list of sentences will be returned. Otherwise joined
         strings will be returned.
 
     Returns
diff --git a/gensim/summarization/syntactic_unit.py b/gensim/summarization/syntactic_unit.py
index 9d02d422ea..537e403723 100644
--- a/gensim/summarization/syntactic_unit.py
+++ b/gensim/summarization/syntactic_unit.py
@@ -3,8 +3,8 @@
 #
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-"""This module contains implementation of SyntacticUnit class. It generally used
-while text cleaning. SyntacticUnit represents printable version of provided
+"""This module contains the implementation of the SyntacticUnit class, generally used
+during text cleaning. SyntacticUnit represents a printable version of the provided
 text.
 
@@ -15,6 +15,7 @@
 """
 
+
 class SyntacticUnit(object):
     """SyntacticUnit class.
 
diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py
index c470f9c912..ffaecb2217 100644
--- a/gensim/summarization/textcleaner.py
+++ b/gensim/summarization/textcleaner.py
@@ -3,7 +3,7 @@
 #
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-"""This module contains functions and processors used for processing text,
+"""This module contains functions and processors used for processing text,
 extracting sentences from text, working with acronyms and abbreviations.
 
 """
@@ -28,34 +28,31 @@
 SEPARATOR = r'@'
 """str: special separator used in abbreviations."""
-RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)  # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)
+RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)
 """SRE_Pattern: pattern to split text to sentences."""
 AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
 """SRE_Pattern: pattern for detecting abbreviations. (Example: Sgt. Pepper)"""
 AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)
 """SRE_Pattern: one more pattern for detecting acronyms."""
 AB_ACRONYM_LETTERS = re.compile(r'([a-zA-Z])\.([a-zA-Z])\.', re.UNICODE)
-"""SRE_Pattern: one more pattern for detecting acronyms.
-(Example: P.S. I love you)"""
+"""SRE_Pattern: one more pattern for detecting acronyms. (Example: P.S. I love you)"""
 UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + SEPARATOR + r'(\w)', re.UNICODE)
-"""SRE_Pattern: Pattern like AB_SENIOR but with SEPARATOR between abbreviation
-and next word"""
+"""SRE_Pattern: Pattern like AB_SENIOR but with SEPARATOR between abbreviation and next word"""
 UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + SEPARATOR + r'(\w)', re.UNICODE)
-"""SRE_Pattern: Pattern like AB_ACRONYM but with SEPARATOR between abbreviation
-and next word"""
+"""SRE_Pattern: Pattern like AB_ACRONYM but with SEPARATOR between abbreviation and next word"""
 
 
 def split_sentences(text):
-    """Splits and returns list of sentences from given text. It preserves
-    abbreviations set in
-    :const:`~gensim.summarization.textcleaner.AB_SENIOR` and
+    """Splits and returns list of sentences from given text. It preserves
+    abbreviations set in
+    :const:`~gensim.summarization.textcleaner.AB_SENIOR` and
     :const:`~gensim.summarization.textcleaner.AB_ACRONYM`.
 
     Parameters
     ----------
     text : str
         Input text.
-    
+
     Returns
     -------
     list of str
         Sentences of given text.
 
     Example
     -------
     >>> from gensim.summarization.textcleaner import split_sentences
-    >>> text = '''Beautiful is better than ugly.
+    >>> text = '''Beautiful is better than ugly.
     ... Explicit is better than implicit. Simple is better than complex.'''
     >>> split_sentences(text)
     ['Beautiful is better than ugly.',
     'Explicit is better than implicit.',
     'Simple is better than complex.']
 
     """
     processed = replace_abbreviations(text)
     return [undo_replacement(sentence) for sentence in get_sentences(processed)]
 
@@ -83,17 +80,17 @@ def replace_abbreviations(text):
     ----------
     text : str
         Input text.
-    
+
     Returns
     -------
     str
         Sentence with changed separator.
-    
+
     Example
     -------
     >>> replace_abbreviations("God bless you, please, Mrs. Robinson")
     God bless you, please, Mrs.@Robinson
-    
+
     """
     return replace_with_separator(text, SEPARATOR, [AB_SENIOR, AB_ACRONYM])
 
@@ -105,12 +102,12 @@ def undo_replacement(sentence):
     ----------
     sentence : str
         Input sentence.
-    
+
     Returns
     -------
     str
         Sentence with changed separator.
-    
+
     Example
     -------
     >>> undo_replacement("God bless you, please, Mrs.@Robinson")
     God bless you, please, Mrs. Robinson
 
@@ -121,7 +118,7 @@ def replace_with_separator(text, separator, regexs):
-    """Returns text with replaced separator if provided regular expressions
+    """Returns text with replaced separator if provided regular expressions
     were matched. Used as a helper in other replacement functions.
 
     Parameters
     ----------
     text : str
         Input text.
     separator : str
         The separator between words to be replaced.
     regexs : list of _sre.SRE_Pattern
         Regular expressions used in processing text.
-    
+
     Returns
     -------
     str
         Text with replaced separators.
 
@@ -147,14 +144,14 @@ def get_sentences(text):
-    """Sentence generator from provided text. Sentence pattern set in
+    """Sentence generator from provided text. Sentence pattern set in
     :const:`~gensim.summarization.textcleaner.RE_SENTENCE`.
 
     Parameters
     ----------
     text : str
         Input text.
-    
+
     Yields
     ------
     str
         Single sentence extracted from text.
 
     Example
     -------
     >>> text = "Does this text contain two sentences? Yes, it does."
     >>> for sentence in get_sentences(text):
     ...     print(sentence)
     Does this text contain two sentences?
     Yes, it does.
 
@@ -174,9 +171,9 @@ def merge_syntactic_units(original_units, filtered_units, tags=None):
-    """Processes given sentences and its filtered (tokenized) copies into
+    """Processes given sentences and their filtered (tokenized) copies into
     SyntacticUnit type. Also adds tags if they are provided to produced units.
-    Returns list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit.
+    Returns list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit.
 
     Parameters
     ----------
     original_units : list
         List of original sentences.
     filtered_units : list
         List of tokenized sentences.
     tags : list of str, optional
         List of strings used as tags for each unit. None as default.
-    
+
     Returns
     -------
     list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit
         SyntacticUnit for each input item.
 
@@ -218,7 +215,7 @@ def join_words(words, separator=" "):
         Given words.
     separator : str, optional
         The separator between elements. Blank space set as default.
-    
+
     Returns
     -------
     str
@@ -230,13 +227,13 @@ def clean_text_by_sentences(text):
     """Tokenizes a given text into sentences, applying filters and lemmatizing them.
-    Returns a list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit.
+    Returns a list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit.
 
     Parameters
     ----------
     text : str
         Given text.
-    
+
     Returns
     -------
     list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit
@@ -251,7 +248,7 @@ def clean_text_by_word(text, deacc=True):
     """Tokenizes a given text into words, applying filters and lemmatizing them.
-    Returns a dictionary with words as keys and :class:~gensim.summarization.syntactic_unit.SyntacticUnit
+    Returns a dictionary with words as keys and :class:~gensim.summarization.syntactic_unit.SyntacticUnit
     as values. Note that different words may lead to the same processed unit.
 
     Parameters
     ----------
     text : str
         Given text.
     deacc : bool, optional
         Remove accentuation if True.
-    
+
     Returns
     -------
     dict
@@ -288,15 +285,15 @@ def tokenize_by_word(text):
     """Tokenizes input text. Before tokenizing transforms text to lower case and
-    removes accentuation and acronyms set
-    :const:`~gensim.summarization.textcleaner.AB_ACRONYM_LETTERS`.
+    removes accentuation and acronyms set in
+    :const:`~gensim.summarization.textcleaner.AB_ACRONYM_LETTERS`.
     Returns generator of words.
 
     Parameters
     ----------
     text : str
         Given text.
-    
+
     Returns
     -------
     generator
 
From 6ec29bfe6aee0d3d52e32a722ec4275d3313552d Mon Sep 17 00:00:00 2001
From: ivan
Date: Mon, 11 Dec 2017 22:58:13 +0500
Subject: [PATCH 14/27] fix doc build

---
 gensim/summarization/graph.py       |  9 +++------
 gensim/summarization/summarizer.py  | 17 ++++++++---------
 gensim/summarization/textcleaner.py | 21 +++++++++++++--------
 3 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/gensim/summarization/graph.py b/gensim/summarization/graph.py
index c3bc473e7c..4509878b74 100644
--- a/gensim/summarization/graph.py
+++ b/gensim/summarization/graph.py
@@ -411,8 +411,7 @@ def get_edge_properties(self, edge):
         return self.edge_properties.setdefault(edge, {})
 
     def add_edge_attributes(self, edge, attrs):
-        """Adds attributes `attrs`to given edge. Order of nodes in edge doesn't
-        matter.
+        """Adds attributes `attrs` to given edge. Order of nodes in edge doesn't matter.
 
         Parameters
         ----------
@@ -426,8 +425,7 @@ def add_edge_attributes(self, edge, attrs):
             self.add_edge_attribute(edge, attr)
 
     def add_edge_attribute(self, edge, attr):
-        """Adds attribute `attr`to given edge. Order of nodes in edge doesn't
-        matter.
+        """Adds attribute `attr` to given edge. Order of nodes in edge doesn't matter.
 
         Parameters
         ----------
@@ -444,8 +442,7 @@ def add_edge_attribute(self, edge, attr):
             self.edge_attr[(edge[1], edge[0])] = self.edge_attributes((edge[1], edge[0])) + [attr]
 
     def edge_attributes(self, edge):
-        """Returns attributes of given edge. In case of non existing edge
-        returns empty list.
+        """Returns attributes of given edge. In case of a non-existing edge returns an empty list.
Parameters ---------- diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index f2c6ff50eb..21f7d5c944 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -6,12 +6,16 @@ """This module provides functions for summarizing texts. Summarizing is based on ranks of text sentences using a variation of the TextRank algorithm (see [1]_ ). +.. [1] Federico Barrios, Federico L´opez, Luis Argerich, Rosita Wachenchauzer (2016). + Variations of the Similarity Function of TextRank for Automated Summarization, + https://arxiv.org/abs/1602.03606 + + +Data +---- -Data: ------ .. data:: INPUT_MIN_LENGTH - Minimal number of sentences in text -.. data:: WEIGHT_THRESHOLD - Minimal weight of edge between graph nodes. Smaller -weights set to zero. +.. data:: WEIGHT_THRESHOLD - Minimal weight of edge between graph nodes. Smaller weights set to zero. Example ------- @@ -44,11 +48,6 @@ I've promised her sweets and a ride in the train, And it's lovely rice pudding for dinner again! - -.. [1] Federico Barrios, Federico L´opez, Luis Argerich, Rosita Wachenchauzer (2016). -Variations of the Similarity Function of TextRank for Automated Summarization, -https://arxiv.org/abs/1602.03606 - """ import logging diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py index ffaecb2217..f1c2d16d6e 100644 --- a/gensim/summarization/textcleaner.py +++ b/gensim/summarization/textcleaner.py @@ -5,6 +5,18 @@ """This module contains functions and processors used for processing text, extracting sentences from text, working with acronyms and abbreviations. + +Data +---- + +.. data:: SEPARATOR - Special separator used in abbreviations. +.. data:: RE_SENTENCE - Pattern to split text to sentences. +.. data:: AB_SENIOR - Pattern for detecting abbreviations (example: Sgt. Pepper). +.. data:: AB_ACRONYM - Pattern for detecting acronyms. +.. data:: AB_ACRONYM_LETTERS - Pattern for detecting acronyms (example: P.S. I love you). +.. data:: UNDO_AB_SENIOR - Pattern like AB_SENIOR but with SEPARATOR between abbreviation and next word. +.. data:: UNDO_AB_ACRONYM - Pattern like AB_ACRONYM but with SEPARATOR between abbreviation and next word. + """ @@ -15,7 +27,7 @@ import re import logging -logger = logging.getLogger('summa.preprocessing.cleaner') +logger = logging.getLogger('summarizer.preprocessing.cleaner') try: from pattern.en import tag @@ -27,19 +39,12 @@ SEPARATOR = r'@' -"""str: special separator used in abbreviations.""" RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) -"""SRE_Pattern: pattern to split text to sentences.""" AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE) -"""SRE_Pattern: pattern for detecting abbreviations. (Example: Sgt. Pepper)""" AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE) -"""SRE_Pattern: one more pattern for detecting acronyms.""" AB_ACRONYM_LETTERS = re.compile(r'([a-zA-Z])\.([a-zA-Z])\.', re.UNICODE) -"""SRE_Pattern: one more pattern for detecting acronyms. (Example: P.S. 
I love you)""" UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + SEPARATOR + r'(\w)', re.UNICODE) -"""SRE_Pattern: Pattern like AB_SENIOR but with SEPARATOR between abbreviation and next word""" UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + SEPARATOR + r'(\w)', re.UNICODE) -"""SRE_Pattern: Pattern like AB_ACRONYM but with SEPARATOR between abbreviation and next word""" def split_sentences(text): From e2a2e60b3ff82e28ed9aa3671bf6d1dcf5cc7a4a Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 11 Dec 2017 23:16:23 +0500 Subject: [PATCH 15/27] fix bm25 --- gensim/summarization/bm25.py | 66 ++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index bac2d8ed55..ec484949cf 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -14,25 +14,23 @@ -Example -------- ->>> import numpy as np +Examples +-------- >>> from gensim.summarization.bm25 import get_bm25_weights >>> corpus = [ ... ["black", "cat", "white", "cat"], ... ["cat", "outer", "space"], ... ["wag", "dog"] ... ] ->>> np.round(get_bm25_weights(corpus), 3) -array([[ 1.282, 0.182, 0. ], - [ 0.13 , 1.113, 0. ], - [ 0. , 0. , 1.022]]) +>>> result = get_bm25_weights(corpus) + Data: ----- -.. data:: PARAM_K1 - free smoothing parameter for BM25. -.. data:: PARAM_B - free smoothing parameter for BM25. -.. data:: EPSILON - constant used for negative idf of document in corpus. +.. data:: PARAM_K1 - Free smoothing parameter for BM25. +.. data:: PARAM_B - Free smoothing parameter for BM25. +.. data:: EPSILON - Constant used for negative idf of document in corpus. + """ @@ -55,27 +53,23 @@ class BM25(object): Size of corpus (number of documents). avgdl : float Average length of document in `corpus`. - corpus : list of (list of str) + corpus : list of list of str Corpus of documents. - f : list of dict - Dictionary with terms frequencies for each document in `corpus`. Words - used as keys and frequencies as values. + f : list of dicts of int + Dictionary with terms frequencies for each document in `corpus`. Words used as keys and frequencies as values. df : dict - Dictionary with terms frequencies for whole `corpus`. Words used as keys - and frequencies as values. + Dictionary with terms frequencies for whole `corpus`. Words used as keys and frequencies as values. idf : dict - Dictionary with inversed terms frequencies for whole `corpus`. Words - used as keys and frequencies as values. + Dictionary with inversed terms frequencies for whole `corpus`. Words used as keys and frequencies as values. """ def __init__(self, corpus): - """Presets atributes and runs initialize() function. - + """ Parameters ---------- - corpus : list of (list of str) - Given corups. + corpus : list of list of str + Given corpus. """ self.corpus_size = len(corpus) @@ -87,10 +81,7 @@ def __init__(self, corpus): self.initialize() def initialize(self): - """Calculates frequencies of terms in documents and in corpus. Also - computes inverse document frequencies. - - """ + """Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies.""" for document in self.corpus: frequencies = {} for word in document: @@ -108,14 +99,13 @@ def initialize(self): self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5) def get_score(self, document, index, average_idf): - """Computes BM25 score of given `document` in relation to item of corpus - selected by `index`. 
+ """Computes BM25 score of given `document` in relation to item of corpus selected by `index`. Parameters ---------- document : list of str Document to be scored. - index : integer + index : int Index of document in corpus selected to score with `document`. average_idf : float Average idf in corpus. @@ -160,19 +150,29 @@ def get_scores(self, document, average_idf): def get_bm25_weights(corpus): - """Returns BM25 scores (weights) of documents in corpus. Each document - has to be weighted with every document in given corpus. + """Returns BM25 scores (weights) of documents in corpus. + Each document has to be weighted with every document in given corpus. Parameters ---------- - corpus : list of (list of str) + corpus : list of list of str Corpus of documents. Returns ------- - list of (list of float) + list of list of float BM25 scores. + Examples + -------- + >>> from gensim.summarization.bm25 import get_bm25_weights + >>> corpus = [ + ... ["black", "cat", "white", "cat"], + ... ["cat", "outer", "space"], + ... ["wag", "dog"] + ... ] + >>> result = get_bm25_weights(corpus) + """ bm25 = BM25(corpus) average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf) From d7056e405e6c764c947fa1e4ca800218a38a4feb Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 11 Dec 2017 23:39:45 +0500 Subject: [PATCH 16/27] fix graph --- gensim/summarization/graph.py | 66 +++++++++++++++++------------------ 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/gensim/summarization/graph.py b/gensim/summarization/graph.py index 4509878b74..cd73673d20 100644 --- a/gensim/summarization/graph.py +++ b/gensim/summarization/graph.py @@ -16,7 +16,7 @@ class Graph (based on IGraph) which implements undirected graph. >>> g.add_node('Lion') >>> g.add_node('Tiger') >>> g.add_node('Wolf') ->>> g.nodes() +>>> sorted(g.nodes()) ['Felidae', 'Lion', 'Tiger', 'Wolf'] Add some edges and check neighbours. @@ -44,8 +44,7 @@ class IGraph(object): @abstractmethod def nodes(self): - """ - Returns all nodes of graph. + """Returns all nodes of graph. Returns ------- @@ -57,12 +56,11 @@ def nodes(self): @abstractmethod def edges(self): - """ - Returns all edges of graph. + """Returns all edges of graph. Returns ------- - list of (tuple of hashable) + list of (hashable, hashable) Edges of graph. """ @@ -70,8 +68,7 @@ def edges(self): @abstractmethod def neighbors(self, node): - """ - Return all nodes that are directly accessible from given node. + """Return all nodes that are directly accessible from given node. Parameters ---------- @@ -109,9 +106,8 @@ def add_node(self, node, attrs=None): Note ---- - While nodes can be of any type, it's strongly recommended - to use only numbers and single-line strings as node identifiers if you - intend to use write(). + While nodes can be of any type, it's strongly recommended to use only numbers and single-line strings + as node identifiers if you intend to use write(). Parameters ---------- @@ -130,7 +126,7 @@ def add_edge(self, edge, wt=1, label='', attrs=None): Parameters ---------- - edge : tuple of hashable + edge : (hashable, hashable) Given edge. wt : float, optional Weight of new edge. @@ -148,8 +144,8 @@ def has_edge(self, edge): Parameters ---------- - edge : tuple of hashable - Given edge. An edge, here, is a tuple of two nodes. + edge : (hashable, hashable) + Given edge. Returns ------- @@ -165,7 +161,7 @@ def edge_weight(self, edge): Parameters ---------- - edge : tuple of hashable + edge : (hashable, hashable) Given edge. 
Returns @@ -195,13 +191,13 @@ class Graph(IGraph): Attributes ---------- - WEIGHT_ATTRIBUTE_NAME : str + Graph.WEIGHT_ATTRIBUTE_NAME : str Name of weight attribute in graph. - DEFAULT_WEIGHT : float + Graph.DEFAULT_WEIGHT : float Weight set by default. - LABEL_ATTRIBUTE_NAME : str + Graph.LABEL_ATTRIBUTE_NAME : str Default name of attribute. Not used. - DEFAULT_LABEL : str + Graph.DEFAULT_LABEL : str Label set by default. Not used. """ @@ -232,7 +228,7 @@ def has_edge(self, edge): Parameters ---------- - edge : tuple of hashable, size = 2 + edge : (hashable, hashable) Given edge. Returns @@ -245,11 +241,11 @@ def has_edge(self, edge): return (u, v) in self.edge_properties and (v, u) in self.edge_properties def edge_weight(self, edge): - """Returns weigth of given edge. + """Returns weight of given edge. Parameters ---------- - edge : tuple of hashable, size = 2 + edge : (hashable, hashable) Given edge. Returns @@ -293,12 +289,11 @@ def has_node(self, node): return node in self.node_neighbors def add_edge(self, edge, wt=1, label='', attrs=None): - """Adds an edge to the graph connecting two nodes. An edge, here, - is a tuple of two nodes. + """Adds an edge to the graph connecting two nodes. Parameters ---------- - edge : tuple of hashable, size = 2 + edge : (hashable, hashable) Given edge. wt : float, optional Weight of new edge. @@ -339,7 +334,7 @@ def add_node(self, node, attrs=None): ---------- node : hashable Given node. - attrs : list, optional + attrs : list of (hashable, hashable), optional Node attributes specified as (attribute, value) Raises @@ -372,7 +367,7 @@ def edges(self): Returns ------- - list of (tuple of hashable, size = 2) + list of (hashable, hashable) Edges of graph. """ @@ -399,7 +394,7 @@ def get_edge_properties(self, edge): Parameters ---------- - edge : tuple of hashable, size = 2 + edge : (hashable, hashable) Given edge. Returns @@ -411,11 +406,11 @@ def get_edge_properties(self, edge): return self.edge_properties.setdefault(edge, {}) def add_edge_attributes(self, edge, attrs): - """Adds attributes `attrs` to given edge. Order of nodes in edge doesn't matter. + """Adds attributes `attrs` to given edge, order of nodes in edge doesn't matter. Parameters ---------- - edge : tuple of hashable, size = 2 + edge : (hashable, hashable) Given edge. attrs : list Provided attributes to add. @@ -425,7 +420,7 @@ def add_edge_attributes(self, edge, attrs): self.add_edge_attribute(edge, attr) def add_edge_attribute(self, edge, attr): - """Adds attribute `attr` to given edge. Order of nodes in edge doesn't matter. + """Adds attribute `attr` to given edge, order of nodes in edge doesn't matter. Parameters ---------- @@ -442,7 +437,11 @@ def add_edge_attribute(self, edge, attr): self.edge_attr[(edge[1], edge[0])] = self.edge_attributes((edge[1], edge[0])) + [attr] def edge_attributes(self, edge): - """Returns attributes of given edge. In case of non existing edge returns empty list. + """Returns attributes of given edge. + + Note + ---- + In case of non existing edge returns empty list. Parameters ---------- @@ -461,8 +460,7 @@ def edge_attributes(self, edge): return [] def set_edge_properties(self, edge, **properties): - """Adds `properties` to given edge. Order of nodes in edge doesn't - matter. + """Adds `properties` to given edge, order of nodes in edge doesn't matter. 
Parameters
         ----------
         edge : (hashable, hashable)
             Given edge.
 
         properties : dict
             Properties to add.
 
From 400966c4a36e40cbd9d0b23dd29722dc82db019e Mon Sep 17 00:00:00 2001
From: ivan
Date: Mon, 11 Dec 2017 23:41:30 +0500
Subject: [PATCH 17/27] fix graph[2]

---
 gensim/summarization/graph.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gensim/summarization/graph.py b/gensim/summarization/graph.py
index cd73673d20..79cd1a160f 100644
--- a/gensim/summarization/graph.py
+++ b/gensim/summarization/graph.py
@@ -424,7 +424,7 @@ def add_edge_attribute(self, edge, attr):
 
         Parameters
         ----------
-        edge : tuple of hashable, size = 2
+        edge : (hashable, hashable)
             Given edge.
 
         attr : object
@@ -445,7 +445,7 @@ def edge_attributes(self, edge):
 
         Parameters
         ----------
-        edge : tuple of hashable, size = 2
+        edge : (hashable, hashable)
             Given edge.
 
         Returns
@@ -464,7 +464,7 @@ def set_edge_properties(self, edge, **properties):
 
         Parameters
         ----------
-        edge : tuple of hashable, size = 2
+        edge : (hashable, hashable)
             Given edge.
 
         properties : dict
@@ -480,7 +480,7 @@ def del_edge(self, edge):
 
         Parameters
         ----------
-        edge : tuple of hashable, size = 2
+        edge : (hashable, hashable)
             Given edge.
 
@@ -496,7 +496,7 @@ def del_edge_labeling(self, edge):
 
         Parameters
         ----------
-        edge : tuple of hashable, size = 2
+        edge : (hashable, hashable)
             Given edge.
 
From 44f617c2b307d06f27032f43796d1d50ea56ea1f Mon Sep 17 00:00:00 2001
From: ivan
Date: Mon, 11 Dec 2017 23:44:27 +0500
Subject: [PATCH 18/27] fix commons

---
 gensim/summarization/commons.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/gensim/summarization/commons.py b/gensim/summarization/commons.py
index b202c4f4e2..f1a2264e46 100644
--- a/gensim/summarization/commons.py
+++ b/gensim/summarization/commons.py
@@ -3,8 +3,7 @@
 #
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-"""This module provides functions of creating graph from sequence of values and
-removing of unreachable nodes.
+"""This module provides functions for creating a graph from a sequence of values and removing unreachable nodes.
 
 
 Examples
 --------
 Create a simple graph with 4 nodes.
 
 >>> gg = build_graph(['Felidae', 'Lion', 'Tiger', 'Wolf'])
 >>> gg.add_edge(("Felidae", "Lion"))
 >>> gg.add_edge(("Felidae", "Tiger"))
->>> gg.nodes()
+>>> sorted(gg.nodes())
 ['Felidae', 'Lion', 'Tiger', 'Wolf']
 
 Remove nodes with no edges.
 
 >>> remove_unreachable_nodes(gg)
->>> gg.nodes()
+>>> sorted(gg.nodes())
 ['Felidae', 'Lion', 'Tiger']
 
 """
@@ -39,7 +38,7 @@ def build_graph(sequence):
 
     Returns
     -------
-    :class:~gensim.summarization.graph.Graph
+    :class:`~gensim.summarization.graph.Graph`
         Created graph.
 
     """
@@ -55,7 +54,7 @@ def remove_unreachable_nodes(graph):
 
     Parameters
     ----------
-    graph : :class:~gensim.summarization.graph.Graph
+    graph : :class:`~gensim.summarization.graph.Graph`
         Given graph.
 
     """
 
From d2fed6c9ad3c98c6f43b819570420196291cb8d8 Mon Sep 17 00:00:00 2001
From: ivan
Date: Tue, 12 Dec 2017 03:58:29 +0500
Subject: [PATCH 19/27] fix keywords

---
 gensim/summarization/keywords.py | 148 ++++++++++++++-----------------
 1 file changed, 65 insertions(+), 83 deletions(-)

diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py
index 54f08fbf0d..7d12ec9e40 100644
--- a/gensim/summarization/keywords.py
+++ b/gensim/summarization/keywords.py
@@ -3,42 +3,32 @@
 #
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-"""This module contains functions to find keywords of the text and building
-graph on tokens from text.
+"""This module contains functions for finding keywords of the text and building a graph on tokens from text.
 
 Examples
 --------
+Extract keywords from text
+
 >>> from gensim.summarization import keywords
 >>> text='''Challenges in natural language processing frequently involve
 ... speech recognition, natural language understanding, natural language
 ... generation (frequently from formal, machine-readable logical forms),
 ... connecting language and machine perception, dialog systems, or some
 ... combination thereof.'''
->>> print(gensim.summarization.keywords(text))
-natural language
-machine
-frequently
-
-
->>> from gensim.summarization.keywords import get_graph
->>> text = '''Fly me to the moon
-... Let me play among the stars
-... Let me see what spring is like
-... On a, Jupiter and Mars'''
->>> g = get_graph(text)
->>> print(g.nodes())
-['fly', 'moon', 'let', 'plai', 'star', 'spring', 'like', 'jupit', 'mar']
->>> print(g.neighbors("let"))
-['moon', 'star']
+>>> keywords(text).split('\\n')
+[u'natural language', u'machine', u'frequently']
 
+Notes
+-----
+Check tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only the first two letters
+for `INCLUDING_FILTER` and `EXCLUDING_FILTER`.
 
 Data:
 -----
-.. data:: WINDOW_SIZE - Size of window, number of consequtive tokens in processing.
-.. data:: INCLUDING_FILTER - including part of speech filters.
-.. data:: EXCLUDING_FILTER - excluding part of speech filters.
+.. data:: WINDOW_SIZE - Size of window, number of consecutive tokens in processing.
+.. data:: INCLUDING_FILTER - Including part of speech filters.
+.. data:: EXCLUDING_FILTER - Excluding part of speech filters.
 
 """
@@ -56,21 +46,16 @@
 WINDOW_SIZE = 2
 
-"""
-Check tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only first two letters
-Example: filter for nouns and adjectives:
-INCLUDING_FILTER = ['NN', 'JJ']
-"""
 INCLUDING_FILTER = ['NN', 'JJ']
 EXCLUDING_FILTER = []
 
 
 def _get_pos_filters():
-    """Returns default including and excluding filters as frozen sets.
-    
+    """Get default including and excluding filters as frozen sets.
 
     Returns
     -------
-    tuple of frozenset
+    (frozenset of str, frozenset of str)
         Including and excluding filters.
 
@@ -78,20 +63,19 @@ def _get_words_for_graph(tokens, pos_filter=None):
-    """Filters given dictionary of tokens using provided part of speech filters
-    and returns appropriate list of words.
+    """Filters given dictionary of tokens using provided part of speech filters.
 
     Parameters
     ----------
     tokens : dict
         Original units (words) as keys and processed units (tokens) as values.
-    pos_filter : tuple of list
-        Part of speech filters, optional.
+    pos_filter : iterable
+        Part of speech filters, optional. If `None`, :func:`_get_pos_filters` is used.
 
     Returns
     -------
-    list
-        Filtered words.
+    list of str
+        Filtered tokens.
 
     Raises
     ------
@@ -117,26 +101,24 @@ def _get_first_window(split_text):
-    """Returns first :const:`~gensim.parsing.keywords.WINDOW_SIZE` tokens from
-    given Splited text.
+    """Get first :const:`~gensim.parsing.keywords.WINDOW_SIZE` tokens from given `split_text`.
 
     Parameters
     ----------
-    split_text : list
-        Given Splited text.
+    split_text : list of str
+        Split text.
 
     Returns
     -------
-    tuple of frozenset
-        Including and excluding filters.
+    list of str
+        First :const:`~gensim.parsing.keywords.WINDOW_SIZE` tokens.
""" return split_text[:WINDOW_SIZE] def _set_graph_edge(graph, tokens, word_a, word_b): - """Sets an edge between nodes named word_a and word_b if they exists in - `tokens` and `graph`, inplace. + """Sets an edge between nodes named word_a and word_b if they exists in `tokens` and `graph`, inplace. Parameters ---------- @@ -170,7 +152,7 @@ def _process_first_window(graph, tokens, split_text): tokens : dict Original units (words) as keys and processed units (tokens) as values. split_text : list of str - Splited text. + Splitted text. """ first_window = _get_first_window(split_text) @@ -179,12 +161,12 @@ def _process_first_window(graph, tokens, split_text): def _init_queue(split_text): - """Initializies queue by first words from `split_text`. + """Initialize queue by first words from `split_text`. Parameters ---------- split_text : list of str - Splited text. + Splitted text. Returns ------- @@ -205,7 +187,7 @@ def _process_word(graph, tokens, queue, word): Parameters ---------- - graph : :class:~gensim.summarization.graph.Graph + graph : :class:`~gensim.summarization.graph.Graph` Given graph. tokens : dict Original units (words) as keys and processed units (tokens) as values. @@ -228,6 +210,7 @@ def _update_queue(queue, word): Given queue. word : str Word to be added to queue. + """ queue.get() queue.put(word) @@ -235,18 +218,19 @@ def _update_queue(queue, word): def _process_text(graph, tokens, split_text): - """Processes `split_text` by updating given `graph` with new eges between - nodes if they exists in `tokens` and `graph`. Words are taken from - `split_text` with window size :const:`~gensim.parsing.keywords.WINDOW_SIZE`. + """Process `split_text` by updating given `graph` with new eges between nodes + if they exists in `tokens` and `graph`. + Words are taken from `split_text` with window size :const:`~gensim.parsing.keywords.WINDOW_SIZE`. Parameters ---------- - graph : :class:~gensim.summarization.graph.Graph + graph : :class:`~gensim.summarization.graph.Graph` Given graph. tokens : dict Original units (words) as keys and processed units (tokens) as values. split_text : list of str - Splited text. + Splitted text. + """ queue = _init_queue(split_text) for i in xrange(WINDOW_SIZE, len(split_text)): @@ -277,9 +261,8 @@ def _queue_iterator(queue): def _set_graph_edges(graph, tokens, split_text): - """Updates given `graph` by setting eges between nodes if they exists in - `tokens` and `graph`. Words are taken from `split_text` with window size - :const:`~gensim.parsing.keywords.WINDOW_SIZE`. + """Updates given `graph` by setting eges between nodes if they exists in `tokens` and `graph`. + Words are taken from `split_text` with window size :const:`~gensim.parsing.keywords.WINDOW_SIZE`. Parameters ---------- @@ -288,19 +271,19 @@ def _set_graph_edges(graph, tokens, split_text): tokens : dict Original units (words) as keys and processed units (tokens) as values. split_text : list of str - Splited text. + Splitted text. + """ _process_first_window(graph, tokens, split_text) _process_text(graph, tokens, split_text) def _extract_tokens(lemmas, scores, ratio, words): - """Extracts tokens from provided lemmas. Most scored lemmas are used if - `words` not provided. + """Extracts tokens from provided lemmas. Most scored lemmas are used if `words` not provided. Parameters ---------- - lemmas : list + lemmas : list of str Given lemmas. scores : dict Dictionary with lemmas and its scores. 
@@ -312,7 +295,7 @@ def _extract_tokens(lemmas, scores, ratio, words): Returns ------- - list of (tuple of float and str) + list of (float, str) Scores and corresponded lemmas. """ @@ -322,7 +305,7 @@ def _extract_tokens(lemmas, scores, ratio, words): def _lemmas_to_words(tokens): - """Returns words and lemmas from given tokens. Produces "reversed" `tokens`. + """Get words and lemmas from given tokens. Produces "reversed" `tokens`. Parameters ---------- @@ -346,14 +329,13 @@ def _lemmas_to_words(tokens): def _get_keywords_with_score(extracted_lemmas, lemma_to_word): - """Returns words of `extracted_lemmas` and its scores. Words contains in - `lemma_to_word`. + """Get words of `extracted_lemmas` and its scores, words contains in `lemma_to_word`. Parameters ---------- - extracted_lemmas : list of tuples - Given lemmas. - lemma_to_word : dict of {lemma:list of words} + extracted_lemmas : list of (float, str) + Given lemmas with scores + lemma_to_word : dict Lemmas and corresponding words. Returns @@ -372,7 +354,7 @@ def _get_keywords_with_score(extracted_lemmas, lemma_to_word): def _strip_word(word): - """Return cleaned `word`. + """Get cleaned `word`. Parameters ---------- @@ -389,15 +371,14 @@ def _strip_word(word): def _get_combined_keywords(_keywords, split_text): - """Returns most scored words (`_keywords`) contained in `split_text` and its - combinations. + """Get most scored words (`_keywords`) contained in `split_text` and it's combinations. Parameters ---------- _keywords : dict Keywords as keys and its scores as values. split_text : list of str - Splited text. + Splitted text. Returns ------- @@ -427,7 +408,7 @@ def _get_combined_keywords(_keywords, split_text): def _get_average_score(concept, _keywords): - """Returns average score of words in `concept`. + """Get average score of words in `concept`. Parameters ---------- @@ -468,8 +449,8 @@ def _format_results(_keywords, combined_keywords, split, scores): Returns ------- - str or list of str or list of (tuple of str) - Formated `combined_keywords`. + list of (str, float) If `scores` **OR** list of str if `split` **OR** str + Keywords in needed format. """ combined_keywords.sort(key=lambda w: _get_average_score(w, _keywords), reverse=True) @@ -482,15 +463,16 @@ def _format_results(_keywords, combined_keywords, split, scores): def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), lemmatize=False, deacc=True): - """Returns most ranked words of provided text and/or its combinations . + """Get most ranked words of provided text and/or its combinations. Parameters ---------- + text : str - Sequence of values. + Input text. ratio : float, optional - If no "words" option is selected, the number of sentences is - reduced by the provided ratio, else, the ratio is ignored. + If no "words" option is selected, the number of sentences is reduced by the provided ratio, + else, the ratio is ignored. words : int, optional Number of returned words. split : bool, optional @@ -500,13 +482,14 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= pos_filter : tuple, optional Part of speech filters. lemmatize : bool, optional - Lemmatize words if True. + If True - lemmatize words. deacc : bool, optional - Remove accentuation if True. + If True - remove accentuation. Returns ------- - str or list of str or list of (tuple of str) + list of (str, float) If `scores` **OR** list of str if `split` **OR** str + Keywords in needed format. 
""" # Gets a dict of word -> lemma @@ -543,8 +526,7 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= def get_graph(text): - """Creates and returns graph from given text. Cleans and tokenizes text - before building graph. + """Creates and returns graph from given text, cleans and tokenize text before building graph. Parameters ---------- @@ -553,7 +535,7 @@ def get_graph(text): Returns ------- - :class:~gensim.summarization.graph.Graph + :class:`~gensim.summarization.graph.Graph` Created graph. """ From 84b0f3a139a4457f8e1e435b1f0bca31854a7cc4 Mon Sep 17 00:00:00 2001 From: ivan Date: Tue, 12 Dec 2017 04:07:58 +0500 Subject: [PATCH 20/27] fix keywords[2] --- gensim/summarization/keywords.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py index 7d12ec9e40..4074088a04 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/summarization/keywords.py @@ -449,8 +449,12 @@ def _format_results(_keywords, combined_keywords, split, scores): Returns ------- - list of (str, float) If `scores` **OR** list of str if `split` **OR** str - Keywords in needed format. + result: list of (str, float) + If `scores`, keywords with scores **OR** + result: list of str + If `split`, keywords only **OR** + result: str + Keywords, joined by endl. """ combined_keywords.sort(key=lambda w: _get_average_score(w, _keywords), reverse=True) @@ -488,8 +492,12 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= Returns ------- - list of (str, float) If `scores` **OR** list of str if `split` **OR** str - Keywords in needed format. + result: list of (str, float) + If `scores`, keywords with scores **OR** + result: list of str + If `split`, keywords only **OR** + result: str + Keywords, joined by endl. """ # Gets a dict of word -> lemma From ba8b1b633699d0cf260c611bc968d57260fd827f Mon Sep 17 00:00:00 2001 From: ivan Date: Tue, 12 Dec 2017 04:12:09 +0500 Subject: [PATCH 21/27] fix mz_entropy --- docs/src/apiref.rst | 1 + docs/src/summarization/mz_entropy.rst | 9 +++++++ gensim/summarization/mz_entropy.py | 39 ++++++++++++--------------- 3 files changed, 27 insertions(+), 22 deletions(-) create mode 100644 docs/src/summarization/mz_entropy.rst diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index c4587f14f9..d67d27a16a 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -88,6 +88,7 @@ Modules: summarization/commons summarization/graph summarization/keywords + summarization/mz_entropy summarization/pagerank_weighted summarization/summariser summarization/syntactic_unit diff --git a/docs/src/summarization/mz_entropy.rst b/docs/src/summarization/mz_entropy.rst new file mode 100644 index 0000000000..31222ca6ab --- /dev/null +++ b/docs/src/summarization/mz_entropy.rst @@ -0,0 +1,9 @@ +:mod:`summarization.mz_entropy` -- Keywords for the Montemurro and Zanette entropy algorithm +============================================================================================ + +.. 
automodule:: gensim.summarization.mz_entropy + :synopsis: Keywords for the Montemurro and Zanette entropy algorithm + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/gensim/summarization/mz_entropy.py b/gensim/summarization/mz_entropy.py index b9c5c02f33..11437f5c86 100644 --- a/gensim/summarization/mz_entropy.py +++ b/gensim/summarization/mz_entropy.py @@ -11,52 +11,47 @@ def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, threshold=0.0): - """Extract keywords from text using the Montemurro and Zanette entropy - algorithm. [1]_ + """Extract keywords from text using the Montemurro and Zanette entropy algorithm. [1]_ Parameters ---------- text: str - document to summarize + Document for summarization. blocksize: int, optional - size of blocks to use in analysis, default is 1024 + Size of blocks to use in analysis. scores: bool, optional - Whether to return score with keywords, default is False + Whether to return score with keywords. split: bool, optional - Whether to return results as list, default is False + Whether to return results as list. weighted: bool, optional - Whether to weight scores by word frequency. Default is True. - False can useful for shorter texts, and allows automatic thresholding + Whether to weight scores by word frequency. + False can useful for shorter texts, and allows automatic thresholding. threshold: float or 'auto', optional - minimum score for returned keywords, default 0.0 - 'auto' calculates the threshold as n_blocks / (n_blocks + 1.0) + 1.0e-8 - Use 'auto' with weighted=False) + Minimum score for returned keywords, 'auto' calculates the threshold as n_blocks / (n_blocks + 1.0) + 1e-8, + use 'auto' with `weighted=False`. Returns ------- results: str - newline separated keywords if `split` == False OR + newline separated keywords if `split` == False **OR** results: list(str) - list of keywords if `scores` == False OR + list of keywords if `scores` == False **OR** results: list(tuple(str, float)) list of (keyword, score) tuples if `scores` == True Results are returned in descending order of score regardless of the format. - Notes - ----- + Note + ---- This algorithm looks for keywords that contribute to the structure of the - text on scales of blocksize words of larger. It is suitable for extracting + text on scales of `blocksize` words of larger. It is suitable for extracting keywords representing the major themes of long texts. References ---------- - [1] Marcello A Montemurro, Damian Zanette, - "Towards the quantification of the semantic information encoded in - written language" - Advances in Complex Systems, Volume 13, Issue 2 (2010), pp. 135-153 - DOI: 10.1142/S0219525910002530 - https://arxiv.org/abs/0907.1558 + .. [1] Marcello A Montemurro, Damian Zanette, "Towards the quantification of the semantic information encoded in + written language". Advances in Complex Systems, Volume 13, Issue 2 (2010), pp. 
135-153, + DOI: 10.1142/S0219525910002530, https://arxiv.org/abs/0907.1558 """ text = to_unicode(text) From 2a283d7294919a24a6d0e629e3fa5cb42cee02ae Mon Sep 17 00:00:00 2001 From: ivan Date: Tue, 12 Dec 2017 13:00:12 +0500 Subject: [PATCH 22/27] fix pagerank_weighted --- gensim/summarization/pagerank_weighted.py | 59 +++++++++++------------ 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py index 68410fbc66..df1352367c 100644 --- a/gensim/summarization/pagerank_weighted.py +++ b/gensim/summarization/pagerank_weighted.py @@ -3,21 +3,23 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" +"""This module calculate PageRank [1]_ based on wordgraph. + +.. [1] https://en.wikipedia.org/wiki/PageRank Examples -------- +Calculate Pagerank for words + >>> from gensim.summarization.keywords import get_graph >>> from gensim.summarization.pagerank_weighted import pagerank_weighted >>> graph = get_graph("The road to hell is paved with good intentions.") ->>> pagerank_weighted(graph) -{'good': 0.70432858653171504, - 'hell': 0.051128871128006126, - 'intent': 0.70432858653171504, - 'pave': 0.051128871128006015, - 'road': 0.051128871128006237} +>>> # result will looks like {'good': 0.70432858653171504, 'hell': 0.051128871128006126, ...} +>>> result = pagerank_weighted(graph) + +Build matrix from graph >>> from gensim.summarization.pagerank_weighted import build_adjacency_matrix >>> build_adjacency_matrix(graph).todense() @@ -37,20 +39,13 @@ from scipy.sparse.linalg import eigs from six.moves import xrange -try: - from numpy import VisibleDeprecationWarning - import warnings - warnings.filterwarnings("ignore", category=VisibleDeprecationWarning) -except ImportError: - pass - def pagerank_weighted(graph, damping=0.85): - """Returns dictionary of `graph`'s nodes and its ranks. + """Get dictionary of `graph` nodes and its ranks. Parameters ---------- - graph : :class:~gensim.summarization.graph.Graph + graph : :class:`~gensim.summarization.graph.Graph` Given graph. damping : float Damping parameter, optional @@ -73,17 +68,17 @@ def pagerank_weighted(graph, damping=0.85): def build_adjacency_matrix(graph): - """Returns matrix representation of given `graph`. + """Get matrix representation of given `graph`. Parameters ---------- - graph : :class:~gensim.summarization.graph.Graph + graph : :class:`~gensim.summarization.graph.Graph` Given graph. Returns ------- - :class:scipy.sparse.csr_matrix, shape = [n, n], n is number of nodes - Adjacency matrix of given `graph`. + :class:`scipy.sparse.csr_matrix`, shape = [n, n] + Adjacency matrix of given `graph`, n is number of nodes. """ row = [] @@ -106,18 +101,18 @@ def build_adjacency_matrix(graph): def build_probability_matrix(graph): - """Returns square matrix of shape (n, n), where n is number of nodes of the + """Get square matrix of shape (n, n), where n is number of nodes of the given `graph`. Parameters ---------- - graph : :class:~gensim.summarization.graph.Graph + graph : :class:`~gensim.summarization.graph.Graph` Given graph. Returns ------- - array, shape = [n, n], n is number of nodes of `graph` - Eigenvector of matrix `a`. + numpy.ndarray, shape = [n, n] + Eigenvector of matrix `a`, n is number of nodes of `graph`. """ dimension = len(graph.nodes()) @@ -130,16 +125,16 @@ def build_probability_matrix(graph): def principal_eigenvector(a): - """Returns eigenvector of square matrix `a`. + """Get eigenvector of square matrix `a`. 
Parameters ---------- - a : array, shape = [n, n] + a : numpy.ndarray, shape = [n, n] Given matrix. Returns ------- - array, shape = [n, ] + numpy.ndarray, shape = [n, ] Eigenvector of matrix `a`. """ @@ -157,15 +152,15 @@ def principal_eigenvector(a): def process_results(graph, vec): - """Returns `graph` nodes and corresponding absolute values of provided - eigenvector. This function os helper for :func:`~gensim.summarization.pagerank_weighted.pagerank_weighted` + """Get `graph` nodes and corresponding absolute values of provided eigenvector. + This function is helper for :func:`~gensim.summarization.pagerank_weighted.pagerank_weighted` Parameters ---------- - graph : :class:~gensim.summarization.graph.Graph + graph : :class:`~gensim.summarization.graph.Graph` Given graph. - vec : array, shape = [n, ], n is number of nodes of `graph` - Given eigenvector. + vec : numpy.ndarray, shape = [n, ] + Given eigenvector, n is number of nodes of `graph`. Returns ------- From 6bd1584662d7cddc4f8882dd3c0ab5813297221e Mon Sep 17 00:00:00 2001 From: ivan Date: Tue, 12 Dec 2017 13:01:36 +0500 Subject: [PATCH 23/27] fix graph rst --- docs/src/summarization/graph.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/summarization/graph.rst b/docs/src/summarization/graph.rst index 909b15cf5e..eb3588077d 100644 --- a/docs/src/summarization/graph.rst +++ b/docs/src/summarization/graph.rst @@ -1,8 +1,8 @@ -:mod:`summarization.graph` -- TextRank graph -========================================================= +:mod:`summarization.graph` -- Graph +=================================== .. automodule:: gensim.summarization.graph - :synopsis: TextRank graph + :synopsis: Graph :members: :inherited-members: :undoc-members: From 7ec89faf7d56ec757328ba38172a7f8c8c0f0f1d Mon Sep 17 00:00:00 2001 From: ivan Date: Tue, 12 Dec 2017 13:37:32 +0500 Subject: [PATCH 24/27] fix summarizer --- gensim/summarization/summarizer.py | 130 ++++++++++++++--------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 21f7d5c944..afeb359b15 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -4,7 +4,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """This module provides functions for summarizing texts. Summarizing is based on -ranks of text sentences using a variation of the TextRank algorithm (see [1]_ ). +ranks of text sentences using a variation of the TextRank algorithm [1]_. .. [1] Federico Barrios, Federico L´opez, Luis Argerich, Rosita Wachenchauzer (2016). Variations of the Similarity Function of TextRank for Automated Summarization, @@ -51,6 +51,7 @@ """ import logging +from gensim.utils import deprecated from gensim.summarization.pagerank_weighted import pagerank_weighted as _pagerank from gensim.summarization.textcleaner import clean_text_by_sentences as _clean_text_by_sentences from gensim.summarization.commons import build_graph as _build_graph @@ -69,12 +70,12 @@ def _set_graph_edge_weights(graph): - """Sets weights using BM25 algorithm. Leaves small weights as zeroes. If all - weights are fairly small forces all weights to 1, inplace. + """Sets weights using BM25 algorithm. Leaves small weights as zeroes. If all weights are fairly small, + forces all weights to 1, inplace. Parameters ---------- - graph : :class:~gensim.summarization.graph.Graph + graph : :class:`~gensim.summarization.graph.Graph` Given graph. 
""" @@ -108,7 +109,7 @@ def _create_valid_graph(graph): Parameters ---------- - graph : :class:~gensim.summarization.graph.Graph + graph : :class:`~gensim.summarization.graph.Graph` Given graph. """ @@ -127,8 +128,9 @@ def _create_valid_graph(graph): graph.add_edge(edge, 1) +@deprecated("Function will be removed in 4.0.0") def _get_doc_length(doc): - """Returns length of (tokenized) document. + """Get length of (tokenized) document. Parameters ---------- @@ -144,6 +146,7 @@ def _get_doc_length(doc): return sum([item[1] for item in doc]) +@deprecated("Function will be removed in 4.0.0") def _get_similarity(doc1, doc2, vec1, vec2): """Returns similarity of two documents. @@ -174,16 +177,16 @@ def _get_similarity(doc1, doc2, vec1, vec2): def _build_corpus(sentences): - """Returns built corpeus from provided sentences. + """Construct corpus from provided sentences. Parameters ---------- - sentences : list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit - Given senteces. + sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` + Given sentences. Returns ------- - list of (list of (tuple of int)) + list of list of (int, int) Corpus built from sentences. """ @@ -193,20 +196,20 @@ def _build_corpus(sentences): def _get_important_sentences(sentences, corpus, important_docs): - """Returns most important sentences. + """Get most important sentences. Parameters ---------- - sentences : list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit - Given senteces. - corpus : list of (list of (tuple of int)) + sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` + Given sentences. + corpus : list of list of (int, int) Provided corpus. - important_docs : list of (list of (tuple of int)) + important_docs : list of list of (int, int) Most important documents of the corpus. Returns ------- - list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit + list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` Most important sentences. """ @@ -216,20 +219,18 @@ def _get_important_sentences(sentences, corpus, important_docs): def _get_sentences_with_word_count(sentences, word_count): - """Returns list of sentences. Total number of returned words close to - specified `word_count`. + """Get list of sentences. Total number of returned words close to specified `word_count`. Parameters ---------- - sentences : list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit - Given senteces. + sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` + Given sentences. word_count : int or None - Number of returned words. If None full most important sentences will be - returned. + Number of returned words. If None full most important sentences will be returned. Returns ------- - list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit + list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` Most important sentences. """ @@ -252,23 +253,22 @@ def _get_sentences_with_word_count(sentences, word_count): def _extract_important_sentences(sentences, corpus, important_docs, word_count): - """Returns most important sentences of the `corpus`. + """Get most important sentences of the `corpus`. Parameters ---------- - sentences : list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit - Given senteces. - corpus : list of (list of (tuple of int)) + sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` + Given sentences. 
+    corpus : list of list of (int, int)
         Provided corpus.
-    important_docs : list of (list of (tuple of int))
+    important_docs : list of list of (int, int)
         Most important docs of the corpus.
     word_count : int
-        Number of returned words. If None full most important sentences will be
-        returned.
+        Number of returned words. If None, all the most important sentences will be returned.

     Returns
     -------
-    list :class:~gensim.summarization.syntactic_unit.SyntacticUnit
+    list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
         Most important sentences.

     """
@@ -287,15 +287,16 @@ def _format_results(extracted_sentences, split):

     Parameters
     ----------
     extracted_sentences : list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit
-        Given senteces.
+        Given sentences.
     split : bool
-        If True senteces will be returned as list. Otherwise senteces will be
-        merged and returned as string.
+        If True, sentences will be returned as a list. Otherwise, sentences will be merged and returned as a string.

     Returns
     -------
-    str or list of str
-        Formated result.
+    list of str
+        If `split` **OR**
+    str
+        Formatted result.

     """
     if split:
@@ -304,16 +305,16 @@


 def _build_hasheable_corpus(corpus):
-    """Hashes and returns `corpus`.
+    """Get hashable version of `corpus`.

     Parameters
     ----------
-    corpus : list of (list of (tuple of int))
+    corpus : list of list of (int, int)
         Given corpus.

     Returns
     -------
-    list of (tuple of (tuple of int))
+    list of tuple of (int, int)
         Hashable corpus.

     """


 def summarize_corpus(corpus, ratio=0.2):
-    """Returns a list of the most important documents of a corpus using a
-    variation of the TextRank algorithm. Used as helper for summarize
-    :func:`~gensim.summarization.summarizer.summarizer`
+    """Get a list of the most important documents of a corpus using a variation of the TextRank algorithm [1]_.
+    Used as helper for summarize :func:`~gensim.summarization.summarizer.summarizer`
+
+    Note
+    ----
+    The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary
+    to make sense.

-    The input must have at least
-    :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary to make sense.
-    The length of the output can be specified using the ratio parameter,
-    which determines how many documents will be chosen for the summary
-    (defaults at 20% of the number of documents of the corpus).

     Parameters
     ----------
-    corpus : list of (list of (tuple of int))
+    corpus : list of list of (int, int)
         Given corpus.
-    ratio : float
+    ratio : float, optional
         Number between 0 and 1 that determines the proportion of the number of
         sentences of the original text to be chosen for the summary, optional.

     Returns
     -------
-    str or list of str
-        Most important documents of given `corpus` sorted by the document score,
-        highest first.
+    list of list of (int, int)
+        Most important documents of given `corpus` sorted by the document score, highest first.

     """
     hashable_corpus = _build_hasheable_corpus(corpus)
@@ -375,37 +374,38 @@


 def summarize(text, ratio=0.2, word_count=None, split=False):
-    """Returns a summarized version of the given text.
+    """Get a summarized version of the given text.

     The output summary will consist of the most representative sentences
     and will be returned as a string, divided by newlines.

-    The input should be a string, and must be longer than
-    :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` sentences for
-    the summary to make sense. The text
-    will be split into sentences using the split_sentences method in the
-    summarization.texcleaner module. Note that newlines divide sentences.
+    Note
+    ----
+    The input should be a string, and must be longer than :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH`
+    sentences for the summary to make sense.
+    The text will be split into sentences using the split_sentences method in the :mod:`gensim.summarization.textcleaner`
+    module. Note that newlines divide sentences.

-    The length of the output can be specified using the ratio and
-    `word_count` parameters.

     Parameters
     ----------
     text : str
         Given text.
-    ratio : float
+    ratio : float, optional
         Number between 0 and 1 that determines the proportion of the number of
-        sentences of the original text to be chosen for the summary. Optional.
-    word_count : int
+        sentences of the original text to be chosen for the summary.
+    word_count : int or None, optional
         Determines how many words the output will contain.
         If both parameters are provided, the ratio will be ignored.
-    split : bool
+    split : bool, optional
         If True, list of sentences will be returned. Otherwise joined
         strings will be returned.

     Returns
     -------
-    str or list of str
+    list of str
+        If `split` **OR**
+    str
         Most representative sentences of the given text.

     """
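A minimal sketch of the two calling conventions documented above; `text` is a placeholder here
and must be a real document longer than INPUT_MIN_LENGTH sentences for the output to be
meaningful:

>>> from gensim.summarization.summarizer import summarize
>>> text = "..."  # placeholder, use a real multi-sentence document here
>>> summary = summarize(text, ratio=0.2)  # str, the chosen sentences joined by newlines
>>> sentences = summarize(text, word_count=60, split=True)  # list of str, about 60 words in total
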
From fa5efce9c082741f2938d75be572aad7aca3c847 Mon Sep 17 00:00:00 2001
From: ivan
Date: Tue, 12 Dec 2017 13:43:08 +0500
Subject: [PATCH 25/27] fix syntactic_unit

---
 gensim/summarization/syntactic_unit.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/gensim/summarization/syntactic_unit.py b/gensim/summarization/syntactic_unit.py
index 537e403723..335ee6a212 100644
--- a/gensim/summarization/syntactic_unit.py
+++ b/gensim/summarization/syntactic_unit.py
@@ -3,15 +3,8 @@
 #
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

-"""This module contains implementation of SyntacticUnit class. It generally used
-while text cleaning. SyntacticUnit represents printable version of provided
-text.
-
-
-Example
--------
->>> print(SyntacticUnit("Beautiful is better than ugly.", "beauti better ugli"))
-Original unit: 'Beautiful is better than ugly.' *-*-*-* Processed unit: 'beauti better ugli'
+"""This module contains implementation of SyntacticUnit class. It is generally used while cleaning text.
+:class:`~gensim.summarization.syntactic_unit.SyntacticUnit` represents printable version of provided text.

 """

@@ -35,7 +28,7 @@ class SyntacticUnit(object):

     """

     def __init__(self, text, token=None, tag=None):
-        """Initializates syntactic unit.
+        """

         Parameters
         ----------
         text : str
             Input text.
         token : str
             Tokenized text.
         tag : str
             Tag of unit, optional.

         """
         self.text = text
         self.token = token
-        self.tag = tag[:2] if tag else None # Just first two letters of tag
+        self.tag = tag[:2] if tag else None  # Just first two letters of tag
         self.index = -1
         self.score = -1

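For reference, a minimal hand-run sketch of the class documented above; the token and tag values
are arbitrary examples:

>>> from gensim.summarization.syntactic_unit import SyntacticUnit
>>> unit = SyntacticUnit("Beautiful is better than ugly.", token="beauti better ugli", tag="VBZ")
>>> unit.tag  # only the first two letters of the tag are stored
'VB'
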
From 0014d88ddcc9f38164c4b97c1eda7f16cbf577c5 Mon Sep 17 00:00:00 2001
From: ivan
Date: Tue, 12 Dec 2017 14:45:03 +0500
Subject: [PATCH 26/27] fix textcleaner

---
 gensim/summarization/textcleaner.py | 43 ++++++++++++-----------------
 1 file changed, 17 insertions(+), 26 deletions(-)

diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py
index f1c2d16d6e..5af6bef257 100644
--- a/gensim/summarization/textcleaner.py
+++ b/gensim/summarization/textcleaner.py
@@ -48,10 +48,8 @@


 def split_sentences(text):
-    """Splits and returns list of sentences from given text. It preserves
-    abbreviations set in
-    :const:`~gensim.summarization.textcleaner.AB_SENIOR` and
-    :const:`~gensim.summarization.textcleaner.AB_ACRONYM`.
+    """Split and get list of sentences from given text. It preserves abbreviations set in
+    :const:`~gensim.summarization.textcleaner.AB_SENIOR` and :const:`~gensim.summarization.textcleaner.AB_ACRONYM`.

     Parameters
     ----------
@@ -79,11 +77,11 @@


 def replace_abbreviations(text):
-    """Replaces blank space to '@' separator after abbreviation and next word.
+    """Replace blank space with '@' separator between abbreviation and next word.

     Parameters
     ----------
-    sentence : str
+    text : str
         Input sentence.

     Returns
@@ -101,7 +99,7 @@


 def undo_replacement(sentence):
-    """Replaces `@` separator back to blank space after each abbreviation.
+    """Replace `@` separator with blank space after each abbreviation.

     Parameters
     ----------
@@ -123,8 +121,7 @@


 def replace_with_separator(text, separator, regexs):
-    """Returns text with replaced separator if provided regular expressions
-    were matched. Used as helper in other reaplcers.
+    """Get text with replaced separator if provided regular expressions were matched.

     Parameters
     ----------
     text : str
         Input text.
     separator : str
         The separator between words to be replaced.
-    regexs : list of _sre.SRE_Pattern
+    regexs : list of `_sre.SRE_Pattern`
         Regular expressions used in processing text.

     Returns
@@ -149,8 +146,8 @@


 def get_sentences(text):
-    """Sentence generator from provided text. Sentence pattern set in
-    :const:`~gensim.summarization.textcleaner.RE_SENTENCE`.
+    """Sentence generator from provided text. Sentence pattern set
+    in :const:`~gensim.summarization.textcleaner.RE_SENTENCE`.

     Parameters
     ----------
@@ -176,9 +173,8 @@


 def merge_syntactic_units(original_units, filtered_units, tags=None):
-    """Processes given sentences and its filtered (tokenized) copies into
-    SyntacticUnit type. Also adds tags if they are provided to produced units.
-    Returns list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit.
+    """Process given sentences and their filtered (tokenized) copies into
+    :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`. Also adds tags to produced units if they are provided.

     Parameters
     ----------
@@ -219,7 +215,7 @@
     words : list of str
         Given words.
     separator : str, optional
-        The separator between elements. Blank space set as default.
+        The separator between elements.

     Returns
     -------
@@ -231,8 +227,7 @@


 def clean_text_by_sentences(text):
-    """Tokenizes a given text into sentences, applying filters and lemmatizing them.
-    Returns a list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit.
+    """Tokenize a given text into sentences, applying filters and lemmatizing them.

     Parameters
     ----------
@@ -241,7 +236,7 @@

     Returns
     -------
-    list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit
+    list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
         Sentences of the given text.

     """
@@ -252,9 +247,7 @@


 def clean_text_by_word(text, deacc=True):
-    """Tokenizes a given text into words, applying filters and lemmatizing them.
-    Returns a dictionary with words as keys and :class:~gensim.summarization.syntactic_unit.SyntacticUnit
-    as values. Note that different words may lead to same processed units.
+    """Tokenize a given text into words, applying filters and lemmatizing them.

     Parameters
     ----------
@@ -266,7 +259,7 @@

     Returns
     -------
     dict
-        Words as keys, :class:~gensim.summarization.syntactic_unit.SyntacticUnit as values.
+        Words as keys, :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` as values.

     Example
     -------
@@ -289,10 +282,8 @@


 def tokenize_by_word(text):
-    """Tokenizes input text. Before tokenizing transforms text to lower case and
-    removes accentuation and acronyms set
+    """Tokenize input text. Before tokenizing, transforms text to lower case and removes accentuation and acronyms set in
     :const:`~gensim.summarization.textcleaner.AB_ACRONYM_LETTERS`.
-    Returns generator of words.

     Parameters
     ----------

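A minimal sketch exercising the public cleaners touched above; exact tokens depend on the
stemmer, so the values in the comments are indicative only:

>>> from gensim.summarization.textcleaner import clean_text_by_word
>>> units = clean_text_by_word("Beautiful is better than ugly.")
>>> "beautiful" in units  # keys are the lower-cased original words; stopwords are filtered out
True
>>> units["beautiful"].token  # processed (stemmed) form of the word, e.g. 'beauti'
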
From 1a0166ad98b23c1ad7f998ca28eed0bd39be9be5 Mon Sep 17 00:00:00 2001
From: ivan
Date: Tue, 12 Dec 2017 14:45:10 +0500
Subject: [PATCH 27/27] fix

---
 gensim/parsing/preprocessing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py
index f0cf22a6e8..cc15b4665b 100644
--- a/gensim/parsing/preprocessing.py
+++ b/gensim/parsing/preprocessing.py
@@ -363,7 +363,7 @@ def preprocess_documents(docs):

     Returns
     -------
-    list of (list of str)
+    list of list of str
         Processed documents split by whitespace.

     Examples