diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 06873e77d3..b7a18f02db 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -68,9 +68,9 @@
 """Math content."""
 RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE)
 """All other tags."""
-RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE)
+RE_P12 = re.compile(r'(({\|)|(\|-(?!\d))|(\|}))(.*?)(?=\n)', re.UNICODE)
 """Table formatting."""
-RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE)
+RE_P13 = re.compile(r'(?<=(\n[ ])|(\n\n)|([ ]{2})|(.\n)|(.\t))(\||\!)([^[\]\n]*?\|)*', re.UNICODE)
 """Table cell formatting."""
 RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE)
 """Categories."""
@@ -78,7 +78,12 @@
 """Remove File and Image templates."""
 RE_P16 = re.compile(r'\[{2}(.*?)\]{2}', re.UNICODE)
 """Capture interlinks text and article linked"""
-
+RE_P17 = re.compile(
+    r'(\n.{0,4}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=)|(scope=))(.*))|'
+    r'(^.{0,2}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=))(.*))',
+    re.UNICODE
+)
+"""Table markup attribute lines (bgcolor, colspan, rowspan, style, class, align, scope)."""
 IGNORED_NAMESPACES = [
     'Wikipedia', 'Category', 'File', 'Portal', 'Template',
     'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject',
@@ -185,10 +190,14 @@ def remove_markup(text, promote_remaining=True, simplify_links=True):
         if simplify_links:
             text = re.sub(RE_P6, '\\2', text)  # simplify links, keep description only
         # remove table markup
-
-        text = text.replace('||', '\n|')  # each table cell on a separate line
+        text = text.replace("!!", "\n|")  # each table head cell on a separate line
+        text = text.replace("|-||", "\n|")  # for cases where a cell is filled with '-'
         text = re.sub(RE_P12, '\n', text)  # remove formatting lines
-        text = re.sub(RE_P13, '\n\\3', text)  # leave only cell content
+        text = text.replace('|||', '|\n|')  # each table cell on a separate line (where |{{a|b}}||cell-content)
+        text = text.replace('||', '\n|')  # each table cell on a separate line
+        text = re.sub(RE_P13, '\n', text)  # leave only cell content
+        text = re.sub(RE_P17, '\n', text)  # remove table attribute lines (bgcolor, colspan, style=, ...)
+
         # remove empty mark-up
         text = text.replace('[]', '')
         # stop if nothing changed between two iterations or after a fixed number of iterations
diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index 724db86957..d8c8fe31c6 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -645,6 +645,18 @@ def test_max_token_len_set(self):
         corpus = self.corpus_class(self.enwiki, processes=1, token_max_len=16, lemmatize=False)
         self.assertTrue(u'collectivization' in next(corpus.get_texts()))
 
+    def test_removed_table_markup(self):
+        """
+        Check that no table markup keyword survives as a token in any extracted text.
+        """
+        enwiki_file = datapath('enwiki-table-markup.xml.bz2')
+        corpus = self.corpus_class(enwiki_file)
+        texts = corpus.get_texts()
+        table_markup = ["style", "class", "border", "cellspacing", "cellpadding", "colspan", "rowspan"]
+        for text in texts:
+            for word in table_markup:
+                self.assertNotIn(word, text)
+
     # #TODO: sporadic failure to be investigated
     # def test_get_texts_returns_generator_of_lists(self):
     #     corpus = self.corpus_class(self.enwiki)
diff --git a/gensim/test/test_data/enwiki-table-markup.xml.bz2 b/gensim/test/test_data/enwiki-table-markup.xml.bz2
new file mode 100644
index 0000000000..8b5b3ec44f
Binary files /dev/null and b/gensim/test/test_data/enwiki-table-markup.xml.bz2 differ