Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

wikipedia: skip messagebox template contents #2159

Merged
merged 1 commit into from
Aug 5, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions sopel/modules/wikipedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def __init__(self, section_name):
self.section_name = section_name

self.citations = False
self.messagebox = False
self.span_depth = 0
self.div_depth = 0

Expand All @@ -49,14 +50,35 @@ def handle_starttag(self, tag, attrs):
if attr[0] == 'class' and 'edit' in attr[1]:
self.span_depth += 1

elif tag == 'div': # We want to skip thumbnail text and the inexplicable table of contents, and as such also need to track div depth
elif tag == 'div':
# We want to skip thumbnail text and the inexplicable table of contents,
# and as such also need to track div depth
if self.div_depth:
self.div_depth += 1
else:
for attr in attrs:
if attr[0] == 'class' and ('thumb' in attr[1] or attr[1] == 'toc'):
self.div_depth += 1

elif tag == 'table':
# Message box templates are what we want to ignore here
for attr in attrs:
if (
attr[0] == 'class'
and any(classname in attr[1].lower() for classname in [
# Most of this list comes from https://en.wikipedia.org/wiki/Template:Mbox_templates_see_also
'ambox', # messageboxes on article pages
'cmbox', # messageboxes on category pages
'imbox', # messageboxes on file (image) pages
'tmbox', # messageboxes on talk pages
'fmbox', # header and footer messageboxes
'ombox', # messageboxes on other types of page
'mbox', # for messageboxes that are used in different namespaces and change their presentation accordingly
'dmbox', # for disambiguation messageboxes
])
):
self.messagebox = True

elif tag == 'ol':
for attr in attrs:
if attr[0] == 'class' and 'references' in attr[1]:
Expand All @@ -71,9 +93,11 @@ def handle_endtag(self, tag):
self.span_depth -= 1
if self.div_depth and tag == 'div':
self.div_depth -= 1
if self.messagebox and tag == 'table':
self.messagebox = False

def handle_data(self, data):
if self.consume and not any([self.citations, self.span_depth, self.div_depth]):
if self.consume and not any([self.citations, self.messagebox, self.span_depth, self.div_depth]):
if not (self.is_header and data == self.section_name): # Skip the initial header info only
self.result += data

Expand Down