Skip to content

Commit

Permalink
Merge branch 'develop' of https://github.com/RaRe-Technologies/gensim
Browse files Browse the repository at this point in the history
…into develop
  • Loading branch information
harshuljain13 committed Sep 29, 2016
2 parents f958d46 + 1c34dfc commit af0c33d
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Changes
- bigram construction can now support multiple bigrams within one sentence
* Fixed issue #838, RuntimeWarning: overflow encountered in exp (@markroxor, [#895](https://github.com/RaRe-Technologies/gensim/pull/895))
* Changed some log messages to warnings as suggested in issue #828. (@rhnvrm, [#884](https://github.com/RaRe-Technologies/gensim/pull/884))
* Fixed issue #851: in summarizer.py, a ValueError is now raised when a single-sentence input is provided, to avoid a ZeroDivisionError. (@metalaman, #887)


0.13.2, 2016-08-19
Expand Down
9 changes: 9 additions & 0 deletions gensim/summarization/summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,11 @@ def summarize_corpus(corpus, ratio=0.2):
_set_graph_edge_weights(graph)
_remove_unreachable_nodes(graph)

# Eigenvectors cannot be calculated when the graph has fewer than 3 reachable nodes. Warn the user to add more text and return None.
if len(graph.nodes()) < 3:
logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3")
return

pagerank_scores = _pagerank(graph)

hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True)
Expand Down Expand Up @@ -193,6 +198,10 @@ def summarize(text, ratio=0.2, word_count=None, split=False):
logger.warning("Input text is empty.")
return

# If only one sentence is present, the function raises an error (Avoids ZeroDivisionError).
if len(sentences) == 1:
raise ValueError("input must have more than one sentence")

# Warns if the text is too short.
if len(sentences) < INPUT_MIN_LENGTH:
logger.warning("Input text is expected to have at least " + str(INPUT_MIN_LENGTH) + " sentences.")
Expand Down
10 changes: 10 additions & 0 deletions gensim/test/test_data/testlowdistinctwords.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
here here.
there there.
here here.
there there.
here here.
there there.
here here.
there there.
here here.
there there.
25 changes: 25 additions & 0 deletions gensim/test/test_summarization.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,17 @@ def test_text_summarization_raises_exception_on_short_input_text(self):
text = "\n".join(text.split('\n')[:8])

self.assertTrue(summarize(text) is not None)

def test_text_summarization_returns_input_on_single_input_sentence(self):
    # summarize() must raise ValueError when the input holds a single sentence.
    data_dir = os.path.join(os.path.dirname(__file__), 'test_data')
    sample_path = os.path.join(data_dir, "testsummarization_unrelated.txt")

    with utils.smart_open(sample_path, mode="r") as handle:
        contents = handle.read()

    # Only the first line of the sample (its first sentence) is kept.
    first_sentence = contents.split('\n')[0]

    self.assertRaises(ValueError, summarize, first_sentence)

def test_corpus_summarization_raises_exception_on_short_input_text(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
Expand Down Expand Up @@ -160,6 +171,20 @@ def test_keywords_runs(self):
kwds_lst = keywords(text, split=True)
self.assertTrue(len(kwds_lst))

def test_low_distinct_words_corpus_summarization_is_none(self):
    # summarize_corpus() is expected to return None for the low-distinct-words sample.
    data_dir = os.path.join(os.path.dirname(__file__), 'test_data')
    sample_path = os.path.join(data_dir, "testlowdistinctwords.txt")

    with utils.smart_open(sample_path, mode="r") as handle:
        raw_text = handle.read()

    # Generate the corpus: one doc2bow document per line of the file.
    tokenized_lines = [line.split() for line in raw_text.split("\n")]
    vocabulary = Dictionary(tokenized_lines)
    corpus = [vocabulary.doc2bow(line_tokens) for line_tokens in tokenized_lines]

    summary = summarize_corpus(corpus)
    self.assertTrue(summary is None)

if __name__ == '__main__':
    # When run as a script: log at DEBUG level with timestamps, then run the tests.
    log_format = '%(asctime)s : %(levelname)s : %(message)s'
    logging.basicConfig(format=log_format, level=logging.DEBUG)
    unittest.main()
5 changes: 4 additions & 1 deletion gensim/test/test_wikicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import os
import sys
import types
import logging
import unittest

from gensim.corpora.wikicorpus import WikiCorpus
Expand All @@ -21,16 +22,18 @@
def datapath(fname):
    """Return the path of ``fname`` inside the ``test_data`` directory under ``module_path``."""
    return os.path.join(module_path, 'test_data', fname)
FILENAME = 'enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2'

logger = logging.getLogger(__name__)

class TestWikiCorpus(unittest.TestCase):

def setUp(self):
    # Build a WikiCorpus from the bundled sample dump before each test.
    # The instance is not stored: nothing in this method uses it afterwards,
    # so the previously unused local binding has been dropped.
    WikiCorpus(datapath(FILENAME))


def test_get_texts_returns_generator_of_lists(self):
logger.debug("Current Python Version is "+str(sys.version_info))
if sys.version_info < (2, 7, 0):
return

wc = WikiCorpus(datapath(FILENAME))
l = wc.get_texts()
self.assertEqual(type(l), types.GeneratorType)
Expand Down

0 comments on commit af0c33d

Please sign in to comment.