diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bc4acd608..2f7ef32f59 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ Changes - bigram construction can now support multiple bigrams within one sentence * Fixed issue #838, RuntimeWarning: overflow encountered in exp (@markroxor, [#895](https://github.com/RaRe-Technologies/gensim/pull/895)) * Changed some log messages to warnings as suggested in issue #828. (@rhnvrm, [#884](https://github.com/RaRe-Technologies/gensim/pull/884)) +* Fixed issue #851, in summarizer.py, ValueError is raised if single sentence input is provided, to avoid ZeroDivisionError. (@metalaman, [#887](https://github.com/RaRe-Technologies/gensim/pull/887)) 0.13.2, 2016-08-19 diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 234dcec377..0779011999 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -158,6 +158,11 @@ def summarize_corpus(corpus, ratio=0.2): _set_graph_edge_weights(graph) _remove_unreachable_nodes(graph) + # Cannot calculate eigenvectors if number of unique words in text < 3. Warns user to add more text. The function ends. + if len(graph.nodes()) < 3: + logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") + return + pagerank_scores = _pagerank(graph) hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) @@ -193,6 +198,10 @@ def summarize(text, ratio=0.2, word_count=None, split=False): logger.warning("Input text is empty.") return + # If only one sentence is present, the function raises an error (Avoids ZeroDivisionError). + if len(sentences) == 1: + raise ValueError("input must have more than one sentence") + # Warns if the text is too short. 
if len(sentences) < INPUT_MIN_LENGTH: logger.warning("Input text is expected to have at least " + str(INPUT_MIN_LENGTH) + " sentences.") diff --git a/gensim/test/test_data/testlowdistinctwords.txt b/gensim/test/test_data/testlowdistinctwords.txt new file mode 100644 index 0000000000..70e20fa3d3 --- /dev/null +++ b/gensim/test/test_data/testlowdistinctwords.txt @@ -0,0 +1,10 @@ +here here. +there there. +here here. +there there. +here here. +there there. +here here. +there there. +here here. +there there. \ No newline at end of file diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index fde845dc93..bd215efcab 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -87,6 +87,17 @@ def test_text_summarization_raises_exception_on_short_input_text(self): text = "\n".join(text.split('\n')[:8]) self.assertTrue(summarize(text) is not None) + + def test_text_summarization_returns_input_on_single_input_sentence(self): + pre_path = os.path.join(os.path.dirname(__file__), 'test_data') + + with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f: + text = f.read() + + # Keeps the first sentence only. + text = text.split('\n')[0] + + self.assertRaises(ValueError,summarize,text) def test_corpus_summarization_raises_exception_on_short_input_text(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') @@ -160,6 +171,20 @@ def test_keywords_runs(self): kwds_lst = keywords(text, split=True) self.assertTrue(len(kwds_lst)) + def test_low_distinct_words_corpus_summarization_is_none(self): + pre_path = os.path.join(os.path.dirname(__file__), 'test_data') + + with utils.smart_open(os.path.join(pre_path, "testlowdistinctwords.txt"), mode="r") as f: + text = f.read() + + # Generate the corpus. 
+ sentences = text.split("\n") + tokens = [sentence.split() for sentence in sentences] + dictionary = Dictionary(tokens) + corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens] + + self.assertTrue(summarize_corpus(corpus) is None) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_wikicorpus.py b/gensim/test/test_wikicorpus.py index 77c4212831..7ac953d847 100644 --- a/gensim/test/test_wikicorpus.py +++ b/gensim/test/test_wikicorpus.py @@ -12,6 +12,7 @@ import os import sys import types +import logging import unittest from gensim.corpora.wikicorpus import WikiCorpus @@ -21,16 +22,18 @@ datapath = lambda fname: os.path.join(module_path, 'test_data', fname) FILENAME = 'enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2' +logger = logging.getLogger(__name__) class TestWikiCorpus(unittest.TestCase): def setUp(self): wc = WikiCorpus(datapath(FILENAME)) - def test_get_texts_returns_generator_of_lists(self): + logger.debug("Current Python Version is "+str(sys.version_info)) if sys.version_info < (2, 7, 0): return + wc = WikiCorpus(datapath(FILENAME)) l = wc.get_texts() self.assertEqual(type(l), types.GeneratorType)