Skip to content

Commit

Permalink
Merge pull request #2159 from sopel-irc/wikipedia-skip-messagebox
Browse files: browse the repository at this point in the history
wikipedia: skip messagebox template contents

Cherry-picked from master @ 6ac286b
  • Loading branch information
dgw committed Aug 5, 2021
1 parent 6cd0b61 commit d6cd69b
Showing 1 changed file with 26 additions and 2 deletions.
28 changes: 26 additions & 2 deletions sopel/modules/wikipedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def __init__(self, section_name):
self.section_name = section_name

self.citations = False
self.messagebox = False
self.span_depth = 0
self.div_depth = 0

Expand All @@ -55,14 +56,35 @@ def handle_starttag(self, tag, attrs):
if attr[0] == 'class' and 'edit' in attr[1]:
self.span_depth += 1

elif tag == 'div': # We want to skip thumbnail text and the inexplicable table of contents, and as such also need to track div depth
elif tag == 'div':
# We want to skip thumbnail text and the inexplicable table of contents,
# and as such also need to track div depth
if self.div_depth:
self.div_depth += 1
else:
for attr in attrs:
if attr[0] == 'class' and ('thumb' in attr[1] or attr[1] == 'toc'):
self.div_depth += 1

elif tag == 'table':
# Message box templates are what we want to ignore here
for attr in attrs:
if (
attr[0] == 'class'
and any(classname in attr[1].lower() for classname in [
# Most of list from https://en.wikipedia.org/wiki/Template:Mbox_templates_see_also
'ambox', # messageboxes on article pages
'cmbox', # messageboxes on category pages
'imbox', # messageboxes on file (image) pages
'tmbox', # messageboxes on talk pages
'fmbox', # header and footer messageboxes
'ombox', # messageboxes on other types of page
'mbox', # for messageboxes that are used in different namespaces and change their presentation accordingly
'dmbox', # for disambiguation messageboxes
])
):
self.messagebox = True

elif tag == 'ol':
for attr in attrs:
if attr[0] == 'class' and 'references' in attr[1]:
Expand All @@ -77,9 +99,11 @@ def handle_endtag(self, tag):
self.span_depth -= 1
if self.div_depth and tag == 'div':
self.div_depth -= 1
if self.messagebox and tag == 'table':
self.messagebox = False

def handle_data(self, data):
if self.consume and not any([self.citations, self.span_depth, self.div_depth]):
if self.consume and not any([self.citations, self.messagebox, self.span_depth, self.div_depth]):
if not (self.is_header and data == self.section_name): # Skip the initial header info only
self.result += data

Expand Down

0 comments on commit d6cd69b

Please sign in to comment.