Skip to content

Commit

Permalink
Add verification when summarize_corpus returns null. Fix #1531. (#1570)
Browse files Browse the repository at this point in the history
* Avoid "NoneType is not iterable..." error for few documents in corpus.

* Fix comment.

* Adding relevant test.

* Fixed return types on summarization border cases:

- Returns empty list on border case of summarize_corpus.
- Returns empty string or empty list on border case of summarize.
- Fixed test accordingly.
- Removed some test code repetition.

* Replace `is` with `==`
  • Loading branch information
fbarrios authored and menshikh-iv committed Sep 18, 2017
1 parent 4c0737a commit 02ba343
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 63 deletions.
14 changes: 10 additions & 4 deletions gensim/summarization/summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def summarize_corpus(corpus, ratio=0.2):
# If the corpus is empty, the function ends.
if len(corpus) == 0:
logger.warning("Input corpus is empty.")
return
return []

# Warns the user if there are too few documents.
if len(corpus) < INPUT_MIN_LENGTH:
Expand All @@ -157,10 +157,11 @@ def summarize_corpus(corpus, ratio=0.2):
_set_graph_edge_weights(graph)
_remove_unreachable_nodes(graph)

# Cannot calculate eigenvectors if number of unique words in text < 3. Warns user to add more text. The function ends.
# Cannot calculate eigenvectors if number of unique documents in corpus < 3.
# Warns user to add more text. The function ends.
if len(graph.nodes()) < 3:
logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3")
return
return []

pagerank_scores = _pagerank(graph)

Expand Down Expand Up @@ -197,7 +198,7 @@ def summarize(text, ratio=0.2, word_count=None, split=False):
# If no sentence could be identified, the function ends.
if len(sentences) == 0:
logger.warning("Input text is empty.")
return
return [] if split else u""

# If only one sentence is present, the function raises an error (Avoids ZeroDivisionError).
if len(sentences) == 1:
Expand All @@ -211,6 +212,11 @@ def summarize(text, ratio=0.2, word_count=None, split=False):

most_important_docs = summarize_corpus(corpus, ratio=ratio if word_count is None else 1)

# If couldn't get important docs, the algorithm ends.
if not most_important_docs:
logger.warning("Couldn't get relevant sentences.")
return [] if split else u""

# Extracts the most important sentences with the selected criterion.
extracted_sentences = _extract_important_sentences(sentences, corpus, most_important_docs, word_count)

Expand Down
99 changes: 40 additions & 59 deletions gensim/test/test_summarization.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,26 +23,24 @@

class TestSummarizationTest(unittest.TestCase):

def test_text_summarization(self):
def _get_text_from_test_data(self, file):
    """Return the full contents of *file*, read from the test_data directory
    that lives next to this test module.

    Parameters
    ----------
    file : str
        Bare filename (no path) of a fixture inside ``test_data/``.

    Returns
    -------
    str
        The entire decoded contents of the fixture file.
    """
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
    # smart_open handles plain and compressed files transparently.
    with utils.smart_open(os.path.join(pre_path, file), mode="r") as f:
        return f.read()

with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
text = f.read()
def test_text_summarization(self):
text = self._get_text_from_test_data("mihalcea_tarau.txt")

# Makes a summary of the text.
generated_summary = summarize(text)

# To be compared to the method reference.
with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.summ.txt"), mode="r") as f:
summary = f.read()
summary = self._get_text_from_test_data("mihalcea_tarau.summ.txt")

self.assertEqual(generated_summary, summary)

def test_corpus_summarization(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
text = f.read()
text = self._get_text_from_test_data("mihalcea_tarau.txt")

# Generate the corpus.
sentences = text.split("\n")
Expand All @@ -54,9 +52,8 @@ def test_corpus_summarization(self):
selected_documents = summarize_corpus(corpus)

# They are compared to the method reference.
with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.summ.txt"), mode="r") as f:
summary = f.read()
summary = summary.split('\n')
summary = self._get_text_from_test_data("mihalcea_tarau.summ.txt")
summary = summary.split('\n')

# Each sentence in the document selection has to be in the model summary.
for doc_number, document in enumerate(selected_documents):
Expand All @@ -67,43 +64,29 @@ def test_corpus_summarization(self):
self.assertTrue(any(all(word in sentence for word in words)) for sentence in summary)

def test_summary_from_unrelated_sentences(self):
# Tests that the summarization of a text with unrelated sentences does not raise an exception.
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
text = f.read()

# Tests that the summarization of a text with unrelated sentences is not empty string.
text = self._get_text_from_test_data("testsummarization_unrelated.txt")
generated_summary = summarize(text)
self.assertNotEqual(generated_summary, u"")

self.assertNotEqual(generated_summary, None)

def test_text_summarization_raises_exception_on_short_input_text(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
text = f.read()
def test_text_summarization_on_short_input_text_is_empty_string(self):
text = self._get_text_from_test_data("testsummarization_unrelated.txt")

# Keeps the first 8 sentences to make the text shorter.
text = "\n".join(text.split('\n')[:8])

self.assertTrue(summarize(text) is not None)

def test_text_summarization_returns_input_on_single_input_sentence(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
self.assertNotEqual(summarize(text), u"")

with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
text = f.read()
def test_text_summarization_raises_exception_on_single_input_sentence(self):
text = self._get_text_from_test_data("testsummarization_unrelated.txt")

# Keeps the first sentence only.
text = text.split('\n')[0]

self.assertRaises(ValueError, summarize, text)

def test_corpus_summarization_raises_exception_on_short_input_text(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
text = f.read()
def test_corpus_summarization_is_not_empty_list_on_short_input_text(self):
text = self._get_text_from_test_data("testsummarization_unrelated.txt")

# Keeps the first 8 sentences to make the text shorter.
sentences = text.split('\n')[:8]
Expand All @@ -113,19 +96,19 @@ def test_corpus_summarization_raises_exception_on_short_input_text(self):
dictionary = Dictionary(tokens)
corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

self.assertTrue(summarize_corpus(corpus) is not None)
self.assertNotEqual(summarize_corpus(corpus), [])

def test_empty_text_summarization_none(self):
self.assertTrue(summarize("") is None)
def test_empty_text_summarization_is_empty_string(self):
    """summarize() on empty input must return an empty unicode string (not None)."""
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(summarize(""), u"")

def test_empty_corpus_summarization_is_none(self):
self.assertTrue(summarize_corpus([]) is None)
def test_empty_text_summarization_with_split_is_empty_list(self):
    """summarize() on empty input with split=True must return an empty list."""
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(summarize("", split=True), [])

def test_corpus_summarization_ratio(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
def test_empty_corpus_summarization_is_empty_list(self):
    """summarize_corpus() on an empty corpus must return an empty list (not None)."""
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(summarize_corpus([]), [])

with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
text = f.read()
def test_corpus_summarization_ratio(self):
text = self._get_text_from_test_data("mihalcea_tarau.txt")

# Generate the corpus.
sentences = text.split('\n')
Expand All @@ -142,10 +125,7 @@ def test_corpus_summarization_ratio(self):
self.assertEqual(len(selected_docs), expected_summary_length)

def test_repeated_keywords(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "testrepeatedkeywords.txt")) as f:
text = f.read()
text = self._get_text_from_test_data("testrepeatedkeywords.txt")

kwds = keywords(text)
self.assertTrue(len(kwds.splitlines()))
Expand All @@ -157,10 +137,7 @@ def test_repeated_keywords(self):
self.assertTrue(len(kwds_lst))

def test_keywords_runs(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt")) as f:
text = f.read()
text = self._get_text_from_test_data("mihalcea_tarau.txt")

kwds = keywords(text)
self.assertTrue(len(kwds.splitlines()))
Expand All @@ -171,20 +148,24 @@ def test_keywords_runs(self):
kwds_lst = keywords(text, split=True)
self.assertTrue(len(kwds_lst))

def test_low_distinct_words_corpus_summarization_is_none(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "testlowdistinctwords.txt"), mode="r") as f:
text = f.read()
def test_low_distinct_words_corpus_summarization_is_empty_list(self):
    """A corpus with too few distinct words yields < 3 graph nodes, so
    summarize_corpus() must return an empty list rather than None."""
    text = self._get_text_from_test_data("testlowdistinctwords.txt")

    # Generate the corpus: one bag-of-words document per line of the fixture.
    sentences = text.split("\n")
    tokens = [sentence.split() for sentence in sentences]
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(summarize_corpus(corpus), [])

def test_low_distinct_words_summarization_is_empty_string(self):
    """Text with too few distinct words cannot be summarized; expect u""."""
    text = self._get_text_from_test_data("testlowdistinctwords.txt")
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(summarize(text), u"")

def test_low_distinct_words_summarization_with_split_is_empty_list(self):
    """Same border case as above but with split=True; expect an empty list."""
    text = self._get_text_from_test_data("testlowdistinctwords.txt")
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(summarize(text, split=True), [])

if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
Expand Down

0 comments on commit 02ba343

Please sign in to comment.