From 1ae22a218f384027ca9f94775894823014357ea0 Mon Sep 17 00:00:00 2001 From: dgw Date: Wed, 14 Jul 2021 00:29:24 -0500 Subject: [PATCH] wikipedia: skip messagebox template contents --- sopel/modules/wikipedia.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/sopel/modules/wikipedia.py b/sopel/modules/wikipedia.py index f9557eeed9..2e20001941 100644 --- a/sopel/modules/wikipedia.py +++ b/sopel/modules/wikipedia.py @@ -29,6 +29,7 @@ def __init__(self, section_name): self.section_name = section_name self.citations = False + self.messagebox = False self.span_depth = 0 self.div_depth = 0 @@ -49,7 +50,9 @@ def handle_starttag(self, tag, attrs): if attr[0] == 'class' and 'edit' in attr[1]: self.span_depth += 1 - elif tag == 'div': # We want to skip thumbnail text and the inexplicable table of contents, and as such also need to track div depth + elif tag == 'div': + # We want to skip thumbnail text and the inexplicable table of contents, + # and as such also need to track div depth if self.div_depth: self.div_depth += 1 else: @@ -57,6 +60,25 @@ def handle_starttag(self, tag, attrs): if attr[0] == 'class' and ('thumb' in attr[1] or attr[1] == 'toc'): self.div_depth += 1 + elif tag == 'table': + # Message box templates are what we want to ignore here + for attr in attrs: + if ( + attr[0] == 'class' + and any(classname in attr[1].lower() for classname in [ + # Most of list from https://en.wikipedia.org/wiki/Template:Mbox_templates_see_also + 'ambox', # messageboxes on article pages + 'cmbox', # messageboxes on category pages + 'imbox', # messageboxes on file (image) pages + 'tmbox', # messageboxes on talk pages + 'fmbox', # header and footer messageboxes + 'ombox', # messageboxes on other types of page + 'mbox', # for messageboxes that are used in different namespaces and change their presentation accordingly + 'dmbox', # for disambiguation messageboxes + ]) + ): + self.messagebox = True + elif tag == 'ol': for attr in attrs: if attr[0] == 'class' and 'references' in attr[1]: @@ -71,9 +93,11 @@ def handle_endtag(self, tag): self.span_depth -= 1 if self.div_depth and tag == 'div': self.div_depth -= 1 + if self.messagebox and tag == 'table': + self.messagebox = False def handle_data(self, data): - if self.consume and not any([self.citations, self.span_depth, self.div_depth]): + if self.consume and not any([self.citations, self.messagebox, self.span_depth, self.div_depth]): if not (self.is_header and data == self.section_name): # Skip the initial header info only self.result += data