Skip to content

Commit

Permalink
Add verification when summarize_corpus returns null. Fix #1531. (#1570)
Browse files Browse the repository at this point in the history
* Avoid "NoneType is not iterable..." error for few documents in corpus.

* Fix comment.

* Adding relevant test.

* Fixed return types on summarization border cases:

- Returns empty list on border case of summarize_corpus.
- Returns empty string or empty list on border case of summarize.
- Fixed test accordingly.
- Removed some test code repetition.

* Replace `is` with `==`
  • Loading branch information
fbarrios authored and menshikh-iv committed Sep 18, 2017
1 parent 4c0737a commit 02ba343
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 63 deletions.
14 changes: 10 additions & 4 deletions gensim/summarization/summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def summarize_corpus(corpus, ratio=0.2):
# If the corpus is empty, the function ends.
if len(corpus) == 0:
logger.warning("Input corpus is empty.")
return
return []

# Warns the user if there are too few documents.
if len(corpus) < INPUT_MIN_LENGTH:
Expand All @@ -157,10 +157,11 @@ def summarize_corpus(corpus, ratio=0.2):
_set_graph_edge_weights(graph)
_remove_unreachable_nodes(graph)

# Cannot calculate eigenvectors if number of unique words in text < 3. Warns user to add more text. The function ends.
# Cannot calculate eigenvectors if number of unique documents in corpus < 3.
# Warns user to add more text. The function ends.
if len(graph.nodes()) < 3:
logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3")
return
return []

pagerank_scores = _pagerank(graph)

Expand Down Expand Up @@ -197,7 +198,7 @@ def summarize(text, ratio=0.2, word_count=None, split=False):
# If no sentence could be identified, the function ends.
if len(sentences) == 0:
logger.warning("Input text is empty.")
return
return [] if split else u""

# If only one sentence is present, the function raises an error (Avoids ZeroDivisionError).
if len(sentences) == 1:
Expand All @@ -211,6 +212,11 @@ def summarize(text, ratio=0.2, word_count=None, split=False):

most_important_docs = summarize_corpus(corpus, ratio=ratio if word_count is None else 1)

# If couldn't get important docs, the algorithm ends.
if not most_important_docs:
logger.warning("Couldn't get relevant sentences.")
return [] if split else u""

# Extracts the most important sentences with the selected criterion.
extracted_sentences = _extract_important_sentences(sentences, corpus, most_important_docs, word_count)

Expand Down
99 changes: 40 additions & 59 deletions gensim/test/test_summarization.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,26 +23,24 @@

class TestSummarizationTest(unittest.TestCase):

def test_text_summarization(self):
def _get_text_from_test_data(self, file):
    """Return the full contents of *file*, read from the test_data directory
    that lives next to this test module.

    Parameters
    ----------
    file : str
        Bare filename (no path) of a fixture inside ``test_data/``.

    Returns
    -------
    str
        The entire decoded contents of the fixture file.
    """
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
    # smart_open handles plain and compressed files transparently.
    with utils.smart_open(os.path.join(pre_path, file), mode="r") as f:
        return f.read()

with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
text = f.read()
def test_text_summarization(self):
text = self._get_text_from_test_data("mihalcea_tarau.txt")

# Makes a summary of the text.
generated_summary = summarize(text)

# To be compared to the method reference.
with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.summ.txt"), mode="r") as f:
summary = f.read()
summary = self._get_text_from_test_data("mihalcea_tarau.summ.txt")

self.assertEqual(generated_summary, summary)

def test_corpus_summarization(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
text = f.read()
text = self._get_text_from_test_data("mihalcea_tarau.txt")

# Generate the corpus.
sentences = text.split("\n")
Expand All @@ -54,9 +52,8 @@ def test_corpus_summarization(self):
selected_documents = summarize_corpus(corpus)

# They are compared to the method reference.
with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.summ.txt"), mode="r") as f:
summary = f.read()
summary = summary.split('\n')
summary = self._get_text_from_test_data("mihalcea_tarau.summ.txt")
summary = summary.split('\n')

# Each sentence in the document selection has to be in the model summary.
for doc_number, document in enumerate(selected_documents):
Expand All @@ -67,43 +64,29 @@ def test_corpus_summarization(self):
self.assertTrue(any(all(word in sentence for word in words)) for sentence in summary)

def test_summary_from_unrelated_sentences(self):
# Tests that the summarization of a text with unrelated sentences does not raise an exception.
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
text = f.read()

# Tests that the summarization of a text with unrelated sentences is not empty string.
text = self._get_text_from_test_data("testsummarization_unrelated.txt")
generated_summary = summarize(text)
self.assertNotEqual(generated_summary, u"")

self.assertNotEqual(generated_summary, None)

def test_text_summarization_raises_exception_on_short_input_text(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
text = f.read()
def test_text_summarization_on_short_input_text_is_empty_string(self):
text = self._get_text_from_test_data("testsummarization_unrelated.txt")

# Keeps the first 8 sentences to make the text shorter.
text = "\n".join(text.split('\n')[:8])

self.assertTrue(summarize(text) is not None)

def test_text_summarization_returns_input_on_single_input_sentence(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
self.assertNotEqual(summarize(text), u"")

with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
text = f.read()
def test_text_summarization_raises_exception_on_single_input_sentence(self):
text = self._get_text_from_test_data("testsummarization_unrelated.txt")

# Keeps the first sentence only.
text = text.split('\n')[0]

self.assertRaises(ValueError, summarize, text)

def test_corpus_summarization_raises_exception_on_short_input_text(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
text = f.read()
def test_corpus_summarization_is_not_empty_list_on_short_input_text(self):
text = self._get_text_from_test_data("testsummarization_unrelated.txt")

# Keeps the first 8 sentences to make the text shorter.
sentences = text.split('\n')[:8]
Expand All @@ -113,19 +96,19 @@ def test_corpus_summarization_raises_exception_on_short_input_text(self):
dictionary = Dictionary(tokens)
corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

self.assertTrue(summarize_corpus(corpus) is not None)
self.assertNotEqual(summarize_corpus(corpus), [])

def test_empty_text_summarization_none(self):
self.assertTrue(summarize("") is None)
def test_empty_text_summarization_is_empty_string(self):
    """summarize() on empty input must return an empty unicode string (not None)."""
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(summarize(""), u"")

def test_empty_corpus_summarization_is_none(self):
self.assertTrue(summarize_corpus([]) is None)
def test_empty_text_summarization_with_split_is_empty_list(self):
    """summarize() on empty input with split=True must return an empty list."""
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(summarize("", split=True), [])

def test_corpus_summarization_ratio(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
def test_empty_corpus_summarization_is_empty_list(self):
    """summarize_corpus() on an empty corpus must return an empty list (not None)."""
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(summarize_corpus([]), [])

with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
text = f.read()
def test_corpus_summarization_ratio(self):
text = self._get_text_from_test_data("mihalcea_tarau.txt")

# Generate the corpus.
sentences = text.split('\n')
Expand All @@ -142,10 +125,7 @@ def test_corpus_summarization_ratio(self):
self.assertEqual(len(selected_docs), expected_summary_length)

def test_repeated_keywords(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "testrepeatedkeywords.txt")) as f:
text = f.read()
text = self._get_text_from_test_data("testrepeatedkeywords.txt")

kwds = keywords(text)
self.assertTrue(len(kwds.splitlines()))
Expand All @@ -157,10 +137,7 @@ def test_repeated_keywords(self):
self.assertTrue(len(kwds_lst))

def test_keywords_runs(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt")) as f:
text = f.read()
text = self._get_text_from_test_data("mihalcea_tarau.txt")

kwds = keywords(text)
self.assertTrue(len(kwds.splitlines()))
Expand All @@ -171,20 +148,24 @@ def test_keywords_runs(self):
kwds_lst = keywords(text, split=True)
self.assertTrue(len(kwds_lst))

def test_low_distinct_words_corpus_summarization_is_none(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "testlowdistinctwords.txt"), mode="r") as f:
text = f.read()
def test_low_distinct_words_corpus_summarization_is_empty_list(self):
    """A corpus with too few distinct words yields < 3 graph nodes, so
    summarize_corpus() must return an empty list rather than None."""
    text = self._get_text_from_test_data("testlowdistinctwords.txt")

    # Generate the corpus: one bag-of-words document per line of the fixture.
    sentences = text.split("\n")
    tokens = [sentence.split() for sentence in sentences]
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(summarize_corpus(corpus), [])

def test_low_distinct_words_summarization_is_empty_string(self):
    """Text with too few distinct words cannot be summarized; expect u""."""
    text = self._get_text_from_test_data("testlowdistinctwords.txt")
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(summarize(text), u"")

def test_low_distinct_words_summarization_with_split_is_empty_list(self):
    """Same border case as above but with split=True; expect an empty list."""
    text = self._get_text_from_test_data("testlowdistinctwords.txt")
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(summarize(text, split=True), [])

if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
Expand Down

0 comments on commit 02ba343

Please sign in to comment.