From 4591d2719bba8d084248cad8e18189f168f018c1 Mon Sep 17 00:00:00 2001 From: Federico Barrios Date: Sun, 17 Sep 2017 00:38:37 -0300 Subject: [PATCH] Newlines are now ignored while splitting by sentences in summarizing. --- gensim/summarization/textcleaner.py | 2 +- gensim/test/test_data/mihalcea_tarau.txt | 16 +++---- .../mihalcea_tarau_multiline.summ.txt | 11 +++++ .../test_data/mihalcea_tarau_multiline.txt | 42 +++++++++++++++++++ gensim/test/test_summarization.py | 16 +++++++ 5 files changed, 78 insertions(+), 9 deletions(-) create mode 100644 gensim/test/test_data/mihalcea_tarau_multiline.summ.txt create mode 100644 gensim/test/test_data/mihalcea_tarau_multiline.txt diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py index 404c44b18e..d48951f9f9 100644 --- a/gensim/summarization/textcleaner.py +++ b/gensim/summarization/textcleaner.py @@ -22,7 +22,7 @@ SEPARATOR = r"@" -RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$) +RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE | re.DOTALL) AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)\s(\w)", re.UNICODE) AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)\s(\w)", re.UNICODE) AB_ACRONYM_LETTERS = re.compile("([a-zA-Z])\.([a-zA-Z])\.", re.UNICODE) diff --git a/gensim/test/test_data/mihalcea_tarau.txt b/gensim/test/test_data/mihalcea_tarau.txt index 6c90af2556..c13c7b24cf 100644 --- a/gensim/test/test_data/mihalcea_tarau.txt +++ b/gensim/test/test_data/mihalcea_tarau.txt @@ -1,11 +1,11 @@ -AP880911-0016 -AP-NR-09-11-88 0423EDT r i -BC-HurricaneGilbert 09-11 0339 -BC-Hurricane Gilbert,0348 -Hurricane Gilbert Heads Toward Dominican Coast -By RUDDY GONZALEZ -Associated Press Writer -SANTO DOMINGO, Dominican Republic (AP) +AP880911-0016. +AP-NR-09-11-88 0423EDT r i. +BC-HurricaneGilbert 09-11 0339- +BC-Hurricane Gilbert,0348. +Hurricane Gilbert Heads Toward Dominican Coast. +By RUDDY GONZALEZ. +Associated Press Writer. +SANTO DOMINGO, Dominican Republic (AP). Hurricane Gilbert swept toward the Dominican Republic Sunday, and the Civil Defense alerted its heavily populated south coast to prepare for high winds, heavy rains and high seas. The storm was approaching from the southeast with sustained winds of 75 mph gusting to 92 mph. ``There is no need for alarm,'' Civil Defense Director Eugenio Cabral said in a television alert shortly before midnight Saturday. diff --git a/gensim/test/test_data/mihalcea_tarau_multiline.summ.txt b/gensim/test/test_data/mihalcea_tarau_multiline.summ.txt new file mode 100644 index 0000000000..6bb6aa8331 --- /dev/null +++ b/gensim/test/test_data/mihalcea_tarau_multiline.summ.txt @@ -0,0 +1,11 @@ +Hurricane Gilbert swept toward the Dominican Republic Sunday, and the Civil +Defense alerted its heavily populated south coast to prepare for high winds, +heavy rains and high seas. +The National Hurricane Center in Miami reported its position at 2 a.m. Sunday +at latitude 16.1 north, longitude 67.5 west, about 140 miles south of Ponce, +Puerto Rico, and 200 miles southeast of Santo Domingo. +The National Weather Service in San Juan, Puerto Rico, said Gilbert was moving +westward at 15 mph with a ``broad area of cloudiness and heavy weather'' +rotating around the center of the storm. +Strong winds associated with the Gilbert brought coastal flooding, strong +southeast winds and up to 12 feet feet to Puerto Rico's south coast. \ No newline at end of file diff --git a/gensim/test/test_data/mihalcea_tarau_multiline.txt b/gensim/test/test_data/mihalcea_tarau_multiline.txt new file mode 100644 index 0000000000..b28c348602 --- /dev/null +++ b/gensim/test/test_data/mihalcea_tarau_multiline.txt @@ -0,0 +1,42 @@ +AP880911-0016. +AP-NR-09-11-88 0423EDT r i. +BC-HurricaneGilbert 09-11 0339. +BC-Hurricane Gilbert,0348. +Hurricane Gilbert Heads Toward Dominican Coast. +By RUDDY GONZALEZ. +Associated Press Writer. +SANTO DOMINGO, Dominican Republic (AP). +Hurricane Gilbert swept toward the Dominican Republic Sunday, and the Civil +Defense alerted its heavily populated south coast to prepare for high winds, +heavy rains and high seas. +The storm was approaching from the southeast with sustained winds of 75 mph +gusting to 92 mph. +``There is no need for alarm,'' Civil Defense Director Eugenio Cabral said in +a television alert shortly before midnight Saturday. +Cabral said residents of the province of Barahona should closely follow +Gilbert's movement. +An estimated 100,000 people live in the province, including 70,000 in the city +of Barahona, about 125 miles west of Santo Domingo. +Tropical Storm Gilbert formed in the eastern Caribbean and strengthened into a +hurricane Saturday night. +The National Hurricane Center in Miami reported its position at 2 a.m. Sunday +at latitude 16.1 north, longitude 67.5 west, about 140 miles south of Ponce, +Puerto Rico, and 200 miles southeast of Santo Domingo. +The National Weather Service in San Juan, Puerto Rico, said Gilbert was moving +westward at 15 mph with a ``broad area of cloudiness and heavy weather'' +rotating around the center of the storm. +The weather service issued a flash flood watch for Puerto Rico and the Virgin +Islands until at least 6 p.m. Sunday. +Strong winds associated with the Gilbert brought coastal flooding, strong +southeast winds and up to 12 feet feet to Puerto Rico's south coast. +There were no reports of casualties. +San Juan, on the north coast, had heavy rains and gusts Saturday, but they +subsided during the night. +On Saturday, Hurricane Florence was downgraded to a tropical storm and its +remnants pushed inland from the U.S. Gulf Coast. +Residents returned home, happy to find little damage from 80 mph winds and +sheets of rain. +Florence, the sixth named storm of the 1988 Atlantic storm season, was the +second hurricane. +The first, Debby, reached minimal hurricane strength briefly before hitting the +Mexican coast last month. \ No newline at end of file diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index a1ba6f223a..7ab9fd665f 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -38,6 +38,22 @@ def test_text_summarization(self): self.assertEqual(generated_summary, summary) + def test_text_summarization_multiline(self): + """This tests that the newlines don't affect the result of the summarization.""" + pre_path = os.path.join(os.path.dirname(__file__), 'test_data') + + with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau_multiline.txt"), mode="r") as f: + text = f.read() + + # Makes a summary of the text. + generated_summary = summarize(text) + + # To be compared to the method reference. + with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau_multiline.summ.txt"), mode="r") as f: + summary = f.read() + + self.assertEqual(generated_summary, summary) + def test_corpus_summarization(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data')