Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update rules for removing table markup. Fix #1710 #1954

Merged
merged 3 commits into from
Mar 15, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions gensim/corpora/wikicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,18 @@
"""Math content."""
RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE)
"""All other tags."""
RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE)
RE_P12 = re.compile(r'(({\|)|(\|-(?!\d))|(\|}))(.*?)(?=\n)', re.UNICODE)
"""Table formatting."""
RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE)
RE_P13 = re.compile(r'(\||\!)(.*?\|)*([^|]*?)', re.UNICODE)
"""Table cell formatting."""
RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE)
"""Categories."""
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
"""Remove File and Image templates."""
RE_P16 = re.compile(r'\[{2}(.*?)\]{2}', re.UNICODE)
"""Capture interlinks text and article linked"""

RE_P17 = re.compile(r'(\n[ ]{0,2}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=))(.*))|(^[ ]{0,2}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=))(.*))', re.UNICODE)
"""Table markup"""
IGNORED_NAMESPACES = [
'Wikipedia', 'Category', 'File', 'Portal', 'Template',
'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
Expand Down Expand Up @@ -185,10 +186,14 @@ def remove_markup(text, promote_remaining=True, simplify_links=True):
if simplify_links:
text = re.sub(RE_P6, '\\2', text) # simplify links, keep description only
# remove table markup

text = text.replace('||', '\n|') # each table cell on a separate line
text = text.replace("!!", "\n|") # leave only cell content(in table head)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the performance difference after these changes?

text = text.replace("|-||", "\n|") # for cases where a cell is filled with '-'
text = re.sub(RE_P12, '\n', text) # remove formatting lines
text = text.replace('|||', '|\n|') # each table cell on a separate line(where |{{a|b}}||cell-content)
text = text.replace('||', '\n|') # each table cell on a separate line
text = re.sub(RE_P13, '\n\\3', text) # leave only cell content
text = re.sub(RE_P17, '', text) # remove formatting lines

# remove empty mark-up
text = text.replace('[]', '')
# stop if nothing changed between two iterations or after a fixed number of iterations
Expand Down
12 changes: 12 additions & 0 deletions gensim/test/test_corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,18 @@ def test_max_token_len_set(self):
corpus = self.corpus_class(self.enwiki, processes=1, token_max_len=16, lemmatize=False)
self.assertTrue(u'collectivization' in next(corpus.get_texts()))

def test_removed_table_markup(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like CI got stuck on this test (and it failed); please debug it.

"""
Check if all the table markup has been removed.
"""
enwiki_file = datapath('enwiki-table-markup.xml.bz2')
corpus = self.corpus_class(enwiki_file)
texts = corpus.get_texts()
table_markup = ["style", "class", "border", "cellspacing", "cellpadding", "colspan", "rowspan"]
for text in texts:
for word in table_markup:
self.assertTrue(word not in text)

# #TODO: sporadic failure to be investigated
# def test_get_texts_returns_generator_of_lists(self):
# corpus = self.corpus_class(self.enwiki)
Expand Down
Binary file added gensim/test/test_data/enwiki-table-markup.xml.bz2
Binary file not shown.