Skip to content

Commit

Permalink
Add article interlinks to the output of gensim.scripts.segment_wiki.
Browse files Browse the repository at this point in the history
…Fix piskvorky#1712 (piskvorky#1839)

* promoting the markup gives up information needed to find the interlinks

* Add interlinks to the output of `segment_wiki`

* New output format is (str, list of (str, str), list of str), reflecting the
structure (title, [(section_heading, section_content), ...], [interlink, ...])

* `filter_wiki` in WikiCorpus will not promote uncaught markup to plain text
as this will give up valuable information for the interlink discovery

* Fixed PEP 8

* Refactoring indentation and variable names

* Removed debugging code from script

* Fixed a bug where interlinks with a description or multiple names were disregarded

* Due to preprocessing in `filter_wiki` interlinks containing alternative names had
one of the 2 `[` and `]` characters removed. The regex now takes that into account.

* Now stripping whitespace off section titles

* Unit test `gensim.scripts.segment_wiki`

* Initiate unit testing for all scripts.

* Check for expected len given article filtering (namespace, size in characters and redirections).

* Check for yielded title, section headings and texts as well as interlinks yielded from generator.

* Check that the same is correctly persisted in JSON.

* Fix PEP 8

* Fix Python 3.5 compatibility

* Section text now completely clean from wiki markup

* Refactored filtering functions in `wikicorpus.py` so that
uncaught markup can be optionally promoted to plain text

* Interlink extraction logic moved to `wikicorpus.py`

* Unit tests modified accordingly

* Added extra logging info to troubleshoot weird Travis behavior

* Fix PEP 8

* pin workers for segment_and_write_all_articles

* Get rid of debugging stuff

* Get rid of global logger

* Interlinks are now mapping from the linked article's title to the actual interlink text

* Used boolean arguments with default values in `filter_wiki`. The default values keep the old functionality
so that existing code does not break

* Overriding the default argument causes interlinks to not be simplified and lets `find_interlinks` create the mappings

* Moved regex outside function

* Interlink extraction is now optional and controlled with the `-i` command line argument

* PEP 8 long lines

* made scripts tests aware of the optional interlinks argument

* Updated script help output for interlinks
  • Loading branch information
steremma authored and menshikh-iv committed Jan 31, 2018
1 parent 1f357a7 commit aa10f79
Show file tree
Hide file tree
Showing 4 changed files with 536 additions and 335 deletions.
62 changes: 51 additions & 11 deletions gensim/corpora/wikicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
"""


import bz2
import logging
import multiprocessing
Expand All @@ -45,7 +44,6 @@
TOKEN_MIN_LEN = 2
TOKEN_MAX_LEN = 15


RE_P0 = re.compile(r'<!--.*?-->', re.DOTALL | re.UNICODE)
"""Comments."""
RE_P1 = re.compile(r'<ref([> ].*?)(</ref>|/>)', re.DOTALL | re.UNICODE)
Expand Down Expand Up @@ -78,6 +76,8 @@
"""Categories."""
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
"""Remove File and Image templates."""
RE_P16 = re.compile(r'\[{2}(.*?)\]{2}', re.UNICODE)
"""Capture interlinks text and article linked"""

IGNORED_NAMESPACES = [
'Wikipedia', 'Category', 'File', 'Portal', 'Template',
Expand All @@ -93,34 +93,70 @@
"""


def filter_wiki(raw):
def find_interlinks(raw):
    """Collect the interlinks (links to other articles) present in `raw`.

    Parameters
    ----------
    raw : str
        Unicode or utf-8 encoded string.

    Returns
    -------
    dict
        Mapping from the linked article to the actual text found.

    """
    # Links are kept un-simplified and leftover brackets are not promoted to
    # plain text, so that the `[[...]]` pattern of RE_P16 can still match them.
    unsimplified = filter_wiki(raw, promote_remaining=False, simplify_links=False)

    title_to_text = {}
    for link in re.findall(RE_P16, unsimplified):
        pieces = link.split('|')
        target = pieces[0]
        # Discard captures whose target still contains bracket characters.
        if '[' in target or ']' in target:
            continue
        # `[[target|text]]` supplies its own display text; a bare `[[target]]`
        # is displayed as the target itself.
        title_to_text[target] = pieces[1] if len(pieces) > 1 else target
    return title_to_text


def filter_wiki(raw, promote_remaining=True, simplify_links=True):
"""Filter out wiki markup from `raw`, leaving only text.
Parameters
----------
raw : str
Unicode or utf-8 encoded string.
promote_remaining : bool
Whether uncaught markup should be promoted to plain text.
simplify_links : bool
Whether links should be simplified keeping only their description text.
Returns
-------
str
`raw` without markup.
"""
# parsing of the wiki markup is not perfect, but sufficient for our purposes
# contributions to improving this code are welcome :)
text = utils.to_unicode(raw, 'utf8', errors='ignore')
text = utils.decode_htmlentities(text) # '&amp;nbsp;' --> '\xa0'
return remove_markup(text)
return remove_markup(text, promote_remaining, simplify_links)


def remove_markup(text):
def remove_markup(text, promote_remaining=True, simplify_links=True):
"""Filter out wiki markup from `text`, leaving only text.
Parameters
----------
text : str
String containing markup.
promote_remaining : bool
Whether uncaught markup should be promoted to plain text.
simplify_links : bool
Whether links should be simplified keeping only their description text.
Returns
-------
Expand All @@ -145,8 +181,11 @@ def remove_markup(text):
text = re.sub(RE_P11, '', text) # remove all remaining tags
text = re.sub(RE_P14, '', text) # remove categories
text = re.sub(RE_P5, '\\3', text) # remove urls, keep description
text = re.sub(RE_P6, '\\2', text) # simplify links, keep description only

if simplify_links:
text = re.sub(RE_P6, '\\2', text) # simplify links, keep description only
# remove table markup

text = text.replace('||', '\n|') # each table cell on a separate line
text = re.sub(RE_P12, '\n', text) # remove formatting lines
text = re.sub(RE_P13, '\n\\3', text) # leave only cell content
Expand All @@ -156,9 +195,9 @@ def remove_markup(text):
if old == text or iters > 2:
break

# the following is needed to make the tokenizer see '[[socialist]]s' as a single word 'socialists'
# TODO is this really desirable?
text = text.replace('[', '').replace(']', '') # promote all remaining markup to plain text
if promote_remaining:
text = text.replace('[', '').replace(']', '') # promote all remaining markup to plain text

return text


Expand Down Expand Up @@ -333,7 +372,7 @@ def extract_pages(f, filter_namespaces=False):
text = None

pageid = elem.find(pageid_path).text
yield title, text or "", pageid # empty page will yield None
yield title, text or "", pageid # empty page will yield None

# Prune the element tree, as per
# http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
Expand Down Expand Up @@ -461,6 +500,7 @@ class WikiCorpus(TextCorpus):
>>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format and mapping
"""

def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
Expand Down
Loading

0 comments on commit aa10f79

Please sign in to comment.