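#!/usr/bin/env python2
# wayback_downloader.py
#
# Download captures of a site from the Internet Archive's Wayback Machine,
# using the CDX server API to enumerate snapshots. Written for Python 2
# (urllib2/urlparse); run it with a Python 2 interpreter.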
import os
import sys
import errno
import urllib
import urllib2
from urlparse import urlparse
import time
from datetime import datetime
from collections import deque
from multiprocessing import Pool
USAGE = """Usage:
    python <script.py> {--help|-h}
    python <script.py> [--threads <threads>] [--matchType {exact|prefix|host|domain}] [--from <timestamp>] [--to <timestamp>] [--limit <snapshots>] [--dry] <url>

Options:
    --help, -h       Display this help message and exit
    --threads, -T    Number of downloading threads (default: 10)
    --matchType, -m  Which results are downloaded, based on <url>:
                         exact   results matching <url> exactly
                         prefix  results under the path <url>
                         host    results from the host of <url>
                         domain  results from the host of <url> and all of its subhosts
    --from, -f       Download only results captured at or after this timestamp
    --to, -t         Download only results captured at or before this timestamp
                     Both <from> and <to> must be a prefix of "yyyyMMddhhmmss"
    --limit, -l      Download at most <snapshots> snapshots
    --dry, -d        List items to be downloaded without downloading them

Example:
    python <script.py> --matchType prefix --from 2010 --to 201606 --limit 1000 example.org
    downloads at most 1000 arbitrary pages under example.org captured between 2010 and June 2016 (inclusive).

For more information, see: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md"""
THREADS = 10
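# Convert a full 14-character CDX timestamp ("yyyyMMddhhmmss") to a datetime.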
def parse_timestamp(timestamp):
    year = int(timestamp[:4])
    month = int(timestamp[4:6])
    day = int(timestamp[6:8])
    hour = int(timestamp[8:10])
    minute = int(timestamp[10:12])
    second = int(timestamp[12:14])
    return datetime(year, month, day, hour, minute, second)
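# Write the body of `response` to `filename`, creating directories as
# needed, and stamp the file with the snapshot's capture time.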
def write(response, filename, timestamp):
    if os.path.isdir(filename):
        dirname, basename = filename, "index.html"
    else:
        dirname, basename = os.path.split(filename)
        if not basename:
            # The URL path ended with '/', so store the body as an index page
            basename = "index.html"
    if dirname and not os.path.exists(dirname):
        try:
            os.makedirs(dirname)
        except OSError as e:
            # A concurrent worker may have created the directory already
            if e.errno != errno.EEXIST:
                return False
    filename = os.path.join(dirname, basename)
    with open(filename, "wb") as f:
        f.write(response.read())
    # Set the file's mtime to the capture time of the snapshot
    os.utime(filename, (time.time(), time.mktime(parse_timestamp(timestamp).timetuple())))
    return True
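# Fetch the single snapshot described by a CDX row and store it under
# <script dir>/<domain>/<path>. Returns True on success, False otherwise.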
def download(row):
    urlkey, timestamp, original, mimetype, statuscode, digest, length = row
    parsed = urlparse(urllib.unquote(original))
    domain = parsed.netloc.split(':')[0]
    filename = os.path.join(os.path.dirname(sys.argv[0]), domain, parsed.path.lstrip('/'))
    # The "if_" flag requests the raw capture, without the Wayback Machine
    # toolbar injected into the page
    url = "http://web.archive.org/web/{}if_/{}".format(timestamp, original)
    response = None
    try:
        response = urllib2.urlopen(url)
        if response.getcode() == 200:
            return write(response, filename, timestamp)
        return False
    except urllib2.URLError:
        # HTTPError is a subclass of URLError, so this catches both
        return False
    finally:
        if response is not None:
            response.close()
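# Query the CDX server for all captures matching `params` and return
# (rows, duplicates), where rows with a duplicate original URL are dropped.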
def list_rows(**params):
    response = None
    rows = []
    try:
        response = urllib2.urlopen("http://web.archive.org/cdx/search/cdx?" + urllib.urlencode(params))
        if response.getcode() == 200:
            rows = [line.split() for line in response]
    except urllib2.URLError:
        # HTTPError is a subclass of URLError, so this catches both
        return [], 0
    finally:
        if response is not None:
            response.close()
    total = len(rows)
    # Keep only the first capture of each original URL, preserving order;
    # set.add() returns None, so "or row" always yields the row itself
    unique = set()
    rows = [unique.add(row[2]) or row for row in rows if row[2] not in unique]
    duplicates = total - len(rows)
    return rows, duplicates
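# List the snapshots that would be downloaded, without downloading them.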
def dry_run(**params):
    rows, duplicates = list_rows(**params)
    print "Found {} snapshot{}{}:".format(len(rows), "" if len(rows) == 1 else "s", " (removed {} duplicate{})".format(duplicates, "" if duplicates == 1 else "s") if duplicates else "")
    for row in rows:
        print "{}: {}".format(parse_timestamp(row[1]).strftime("%Y-%m-%d %H:%M:%S"), row[2])
def download_all(threads, **params):
    rows, duplicates = list_rows(**params)
    print "Downloading {} snapshot{}{}...".format(len(rows), "" if len(rows) == 1 else "s", " (removed {} duplicate{})".format(duplicates, "" if duplicates == 1 else "s") if duplicates else "")
    total = len(rows)
    pool = Pool(threads)
    done = 0
    while rows:
        failed = []
        # Hand the pool a copy of the list; collecting failures into a new
        # list avoids mutating the sequence pool.imap is still consuming
        for i, success in enumerate(pool.imap(download, list(rows))):
            if success:
                done += 1
            else:
                failed.append(rows[i])
            print "\rProgress: {}/{} ({:.2f}%)".format(done, total, 100.0 * done / total),
        rows = failed
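# Parse command-line arguments into a dict of CDX query parameters.
# Returns None on malformed input, which triggers the usage message.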
def parseargs(argv):
    if not argv:
        return None
    args = {
        "threads": ["--threads", "-T"],
        "matchType": ["--matchType", "-m"],
        "from": ["--from", "-f"],
        "to": ["--to", "-t"],
        "limit": ["--limit", "-l"],
    }
    flags = {
        "help": ["--help", "-h"],
        "dry": ["--dry", "-d"]
    }
    params = {}
    for arg, names in args.iteritems():
        for name in names:
            if name in argv:
                index = argv.index(name)
                argv.pop(index)
                if index >= len(argv):
                    return None  # option given without a value
                params[arg] = argv.pop(index)
    for flag, names in flags.iteritems():
        for name in names:
            if name in argv:
                argv.remove(name)
                params[flag] = True
    if len(argv) > 1:
        return None
    if argv:
        params["url"] = argv.pop()
    return params
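# Entry point: parse arguments, then either list (--dry) or download.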
def main():
    params = parseargs(sys.argv[1:])
    if params is None or "help" in params:
        print USAGE.replace("<script.py>", sys.argv[0])
        # Exit successfully when help was requested, with an error otherwise
        sys.exit(0 if params and "help" in params else 1)
    if "threads" in params:
        try:
            threads = int(params.pop("threads"))
        except ValueError:
            print "Invalid --threads option"
            sys.exit(1)
    else:
        threads = THREADS
    if "dry" in params:
        del params["dry"]
        dry_run(**params)
    else:
        download_all(threads, **params)
    sys.exit(0)

if __name__ == "__main__":
    main()