-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy patharchive_url.py
executable file
·65 lines (59 loc) · 2.38 KB
/
archive_url.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#! /usr/bin/env python
from __future__ import unicode_literals
import mwclient
import mwparserfromhell
import re
from theobot import bot
from theobot import password
# CC-BY-SA Theopolisme
# Shared connection to English Wikipedia, logged in with the credentials held
# in theobot.password. ArchiveURLProcessor.process_page reads this
# module-level name directly, so it has to be defined at import time.
# NOTE: the former ``global site`` statement was dropped -- ``global`` has no
# effect at module scope, where every assignment is already global.
site = mwclient.Site('en.wikipedia.org')
site.login(password.username, password.password)
class ArchiveURLProcessor(object):
    """Add the missing ``url`` parameter to ``cite web`` templates.

    For each page title given to :meth:`start`, every ``cite web`` template
    that has an ``archiveurl`` parameter but no ``url`` parameter receives a
    ``url`` derived from the archived address. Snapshot links on
    web.archive.org and pandora.nla.gov.au are understood.

    The class relies on the module-level ``site`` connection and on the
    ``bot`` helper module imported by this file.

    ``print(...)`` is always called with a single argument, so it behaves
    exactly like the Python 2 ``print`` statement.
    """

    # Snapshot URL layouts; group 1 is the address that was archived. The
    # patterns are applied with ``match`` to the stripped parameter value.
    _ARCHIVE_URL_PATTERNS = (
        re.compile(r"https?://web\.archive\.org/web/\d*/(.*)", re.UNICODE),
        re.compile(r"https?://pandora\.nla\.gov\.au/nph-wb/\d*/(.*)",
                   re.UNICODE),
    )

    def __init__(self):
        # Number of successful saves so far; handed to bot.donenow() on
        # every call to process_page.
        self.donenow = 0
        print("ArchiveURLProcessor() initialized.")

    def start(self, pages):
        """Run :meth:`process_page` on every title in *pages*, in order."""
        for page in pages:
            self.process_page(page)

    @staticmethod
    def _original_url(archive_url):
        """Return the address a snapshot link points at.

        Args:
            archive_url: value of an ``archiveurl`` parameter, as text.

        Returns:
            The archived address with surrounding whitespace removed and
            ``http://`` prepended when it carries no http(s) scheme, or
            ``None`` when the link matches no known snapshot layout or the
            archived address is empty.
        """
        candidate = archive_url.strip()
        for pattern in ArchiveURLProcessor._ARCHIVE_URL_PATTERNS:
            found = pattern.match(candidate)
            if found is None:
                continue
            target = found.group(1).strip()
            if not target:
                # Nothing after the timestamp: there is no URL to restore.
                return None
            # Only the start of the address decides whether a scheme is
            # present; an "http://" inside a query string does not count.
            if not target.startswith(("http://", "https://")):
                target = "http://" + target
            return target
        return None

    def _add_missing_url(self, template):
        """Add ``url`` to *template* when it qualifies.

        A template qualifies when its name contains "cite web" (compared
        case-insensitively, since ``{{Cite web}}`` is the same template),
        it has ``archiveurl`` and it lacks ``url``.

        Returns:
            True if a ``url`` parameter was added, False otherwise.
        """
        if "cite web" not in unicode(template.name).strip().lower():
            return False
        # has_param() normalises whitespace around the parameter name, so
        # "|archiveurl = ..." is recognised as well as "|archiveurl=...".
        if template.has_param('url') or not template.has_param('archiveurl'):
            return False
        # Read the value from the parsed parameter rather than running a
        # regex over the whole template, so multi-line templates work too.
        archive_url = unicode(template.get('archiveurl').value)
        new_url = self._original_url(archive_url)
        if new_url is None:
            print("I don't recognize the archive structure, sadly. Skipping.")
            return False
        template.add("url", new_url)
        print("Added url parameter to {{cite web}} template.")
        return True

    def process_page(self, page):
        """Fix the ``cite web`` templates of one page and save it.

        Args:
            page: title of the page to process.

        The page is skipped unless ``bot.donenow()`` returns a true value
        (that helper lives outside this file; judging by the page name it
        is the on-wiki disable switch -- confirm against theobot.bot).
        The page is saved only when at least one template was changed, and
        ``self.donenow`` is incremented only after a successful save.
        """
        if not bot.donenow("User:Theo's Little Bot/disable/archiveurl",
                           donenow=self.donenow, donenow_div=5):
            return
        print("Processing " + page.encode("ascii", "replace"))
        wiki_page = site.Pages[page]
        wikicode = mwparserfromhell.parse(wiki_page.edit())

        changed = False
        for template in wikicode.filter_templates():
            # Visit every template; do not short-circuit after the first fix.
            if self._add_missing_url(template):
                changed = True
        if not changed:
            # Nothing was added, so there is nothing to save or to count.
            return

        try:
            wiki_page.save(unicode(wikicode), summary="Fixing references: adding url parameter ([[WP:BOT|bot]] - [[User:Theo's Little Bot/disable/archiveurl|disable]])")
        except Exception:
            # Best effort: a failed save must not stop the run, but
            # KeyboardInterrupt/SystemExit are allowed to propagate.
            print("Unable to save page; skipping.")
        else:
            self.donenow += 1  # we only count it as "done" when we make a change
# Script driver: runs at import/execution time.
# Fetch the pages to work on from the error-tracking category.
# bot.cats_recursive() is defined in the theobot package, not in this file --
# presumably it also descends into subcategories; confirm against that module.
pages = bot.cats_recursive('Category:Pages with archiveurl citation errors')
# Build the processor (its constructor prints a start-up message) and hand it
# the pages; start() calls process_page() on each one in turn.
archivebot = ArchiveURLProcessor()
archivebot.start(pages)