-
Notifications
You must be signed in to change notification settings - Fork 0
/
icpc
executable file
·232 lines (192 loc) · 7.91 KB
/
icpc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Usage:
./icpc -n --years 2010 2016 -- "select * from icpc" # stateless query...
./icpc --years 2010 # scrape for year 2010 (keeps previously scraped data)
./icpc -c --years 2016 # clear all previously scraped ICPC data first, then scrape for 2016
./icpc "select rank,year,contest from rankings where univ like 'stanford%'"
# Override a single region mapping from stdin (no argument or '-' after -f)
echo -e "USA (override)\tHarvard University" \
| ./icpc -f -r "select * from icpc WHERE year=2016 ORDER BY rank LIMIT 3"
"""
import argparse
import re
import sqlite3
import sys
import urllib.request as UR
import xml.etree.ElementTree as ET
from common import connect_db
# Years substituted for --years when it contains -1: 2005 through 2016,
# both ends included.
_FIRST_SUPPORTED_YEAR = 2005
_LAST_SUPPORTED_YEAR = 2016
SUPPORTED_YEARS = list(range(_FIRST_SUPPORTED_YEAR, _LAST_SUPPORTED_YEAR + 1))
def query(s, db):
    """Execute the SQL statement ``s`` on ``db`` and print every result row.

    Each row returned by ``db.execute`` is passed to ``print`` on its own,
    so the output has one row per line on standard output.
    """
    cursor = db.execute(s)
    for record in cursor:
        print(record)
def scrape_ranking(tree, year, db):
    """Insert the standings found in ``tree`` into the ``rankings`` table.

    The child elements of the XHTML tables with id ``medalTable`` and
    ``rankTable`` are visited in that order.  For each child, the text of
    its sub-elements (newlines removed, surrounding whitespace stripped,
    empty text turned into ``None``) forms one row.  Rows whose first cell
    is missing or empty are skipped.

    Args:
        tree: Parsed results page (an ``xml.etree.ElementTree`` element).
        year: Contest year stored alongside every inserted row.
        db: Database handle whose ``execute(sql, params)`` performs the
            insert.  Errors raised by ``execute`` propagate to the caller.
    """
    pos = 0
    for tbl_name in ("medalTable", "rankTable"):
        v_print(1, "parsing table", tbl_name)
        path = ".//{http://www.w3.org/1999/xhtml}table[@id='%s']/*" % tbl_name
        for tr in tree.findall(path):
            # Iterate the element itself: Element.getchildren() was removed
            # in Python 3.9 and iteration yields the same children.
            row = [
                td.text.replace("\n", "").strip() or None
                for td in tr
                if td.text is not None
            ]
            if not (row and row[0]):
                continue
            pos += 1
            v_print(2, "parsed row#%d: %s" % (pos, tuple(row)))
            # Keep only the first four cells (drops e.g. "Last solved").
            del row[4:]
            if len(row) < 4:
                row.append(None)  # add unknown penalty
            db.execute(
                "INSERT INTO rankings VALUES\n"
                "('icpc', ?, ?, ?, ?, ?)",
                [year] + row,
            )
def scrape_regions(tree, year, db):
    """Insert region/university pairs from ``tree`` into ``icpc_geo``.

    Every child of the XHTML table with id ``regionTable`` except the first
    one is turned into a row made of the text of its sub-elements and
    inserted with ``INSERT OR IGNORE``.  The number of rows actually
    inserted (summed ``rowcount``) is logged at verbosity 1.

    Args:
        tree: Parsed results page (an ``xml.etree.ElementTree`` element).
        year: Contest year; only used in log messages.
        db: Database handle whose ``execute(sql, params)`` returns a cursor
            exposing ``rowcount``.
    """
    count = 0
    path = ".//{http://www.w3.org/1999/xhtml}table[@id='regionTable']/*"
    # [1:] skips the first child of the table.
    for entry in tree.findall(path)[1:]:
        # Iterate the element itself: Element.getchildren() was removed in
        # Python 3.9 and iteration yields the same children.
        row = [cell.text for cell in entry]
        v_print(2, "parsed region from %d: %s" % (year, row))
        cursor = db.execute(
            "INSERT OR IGNORE INTO icpc_geo VALUES (?, ?)", row
        )
        count += cursor.rowcount
    v_print(1, "scraped %d new entries from %d" % (count, year))
def _repair_markup(contents, year):
    """Return ``contents`` patched so the results page parses as XML."""
    # years 2013-2015 have unclosed <meta http-equiv=...> tag
    contents = re.sub(r"(<meta http-[^>]*[^/])>", r"\1/>", contents)
    # years 2011-2012 and 2008 that have <tr>s within <tbody> tag
    contents = re.sub(r"</?tbody>", "", contents)
    # years <= 2010 use entities "&<symbol>;", not Unicode characters
    ent_to_uni = {
        "aacute": "á",
        "atilde": "ã",
        "eacute": "é",
        "egrave": "è",
        "nbsp": " ",
        "oacute": "ó",
    }
    entity_decls = "".join(
        "\n<!ENTITY %s '%s'>" % (ent, uni)
        for ent, uni in ent_to_uni.items()
    )
    # Declare the entities in an internal subset right after the DTD URL.
    # The dot is escaped and only the first match is touched so that no
    # other text ending in dtd" can be altered.
    contents = re.sub(
        r'(\.dtd")', r"\1 [" + entity_decls + "\n]", contents, count=1)
    # years <= 2010 also don't have id attribute in <table>
    if year <= 2010:
        contents = re.sub(
            r'(<table)', r'\1 id="rankTable"', contents, count=1)
        # Number the first tables in document order so that the third one
        # can be picked out and tagged as the region table.
        region_tbl_ord = 3
        for n in range(region_tbl_ord):
            contents = re.sub(
                r'(<table)(?! n=)', r'\1 n="%d"' % n, contents,
                count=1
            )
        contents = contents.replace(
            '<table n="%d"' % (region_tbl_ord - 1),
            '<table id="regionTable"'
        )
    # year 2006 has extra <div> in table entry for DePaul University
    if year == 2006:
        contents = re.sub("<div><p>(.*)</p></div>", r"\1", contents)
    return contents


def fetch(year):
    """Return the parsed results page for ``year`` as an ElementTree element.

    With ``FLAGS.cache`` set, a cached copy (a file in the current
    directory named after the URL) is tried first; if it cannot be read or
    is empty, the page is downloaded.  A freshly downloaded page is
    repaired when ``FLAGS.fix`` is set and, with ``FLAGS.cache``, written
    to the cache file.  A page loaded from the cache is parsed as stored.

    Args:
        year: Contest year, interpolated into the results URL.

    Raises:
        Exception: If the HTTP response code is not 200.
        xml.etree.ElementTree.ParseError: If the page is not well-formed.
    """
    url = "https://icpc.baylor.edu/community/results-%d" % year
    contents = ""
    if FLAGS.cache:
        path = url.partition("//")[-1].replace("/", "_") + ".html"
        try:
            v_print(1, "loading HTML for year %d @ %s" % (year, path))
            # The page is decoded as UTF-8 on download, so the cache is
            # read and written as UTF-8 instead of the locale encoding.
            with open(path, encoding="utf-8") as f:
                contents = f.read()
        except (OSError, UnicodeError) as e:
            v_print(0, "warning: no readable cached HTML @ %s: %s" % (path, e))
    if not contents:
        v_print(1, "scraping from:", url)
        # The context manager closes the HTTP response in every case.
        with UR.urlopen(UR.Request(url)) as resp:
            if resp.code != 200:
                raise Exception("Failed to fetch results for year: %d" % year)
            contents = resp.read().decode('utf-8')
        if FLAGS.fix:
            contents = _repair_markup(contents, year)
        if FLAGS.cache:
            with open(path, 'w', encoding="utf-8") as f:
                f.write(contents)
                f.write("\n")
            v_print(1, "saved HTML for year %d to: %s" % (year, path))
    return ET.fromstring(contents)
def main(args=sys.argv):
    """Command-line entry point: parse flags, scrape, then run SQL queries.

    Args:
        args: argv-style sequence; ``args[0]`` is used as the program name
            and the remaining items are parsed as command-line arguments.

    Returns:
        Always 0.
    """
    parser = argparse.ArgumentParser(
        prog=args[0],
        description="""""")
    parser.add_argument(
        "-v", "--verbose", dest='v', action='count', default=0,
        help="increase output verbosity (can be repeated)")
    parser.add_argument(
        "-n", "--dry-run", action='store_true',
        help="do not persist anything to the database file")
    parser.add_argument(
        "-c", "--clear", action='store_true',
        help="clears the previously scraped ICPC data")
    parser.add_argument(
        "-y", "--years", type=int, nargs='+', default=[],
        help="scrape for the specified years")
    parser.add_argument(
        "-r", "--regions", action='store_true',
        help="scrape regions for some universities")
    # A bare "-f" (no file name) falls back to const='-'; the module usage
    # text describes that case as reading the mappings from stdin.
    parser.add_argument(
        "-f", "--regions-file", type=argparse.FileType('r'),
        nargs='?', const='-',
        help="read additional region to university mappings in the TSV format")
    parser.add_argument(
        "--cache", action='store_true',
        help="save the HTML page(s) and avoid fetching for the cached ones")
    parser.add_argument(
        "--no-fix", dest='fix', action='store_false',
        help="do not fix malformed HTML (e.g., unclosed tags)")
    parser.add_argument("queries", metavar='QUERY', nargs='*')
    args = parser.parse_args(args[1:])
    # A -1 anywhere in --years selects every year in SUPPORTED_YEARS.
    if -1 in args.years:
        args.years = SUPPORTED_YEARS
    # Publish the parsed flags module-wide; v_print() and fetch() read FLAGS.
    global FLAGS
    FLAGS = args
    v_print(1, "FLAGS:", args)
    # Rankings are always scraped; regions only when -r/--regions is given.
    scrape_fns = [scrape_ranking] + ([scrape_regions] if FLAGS.regions else [])
    # connect_db comes from the project-local `common` module; how it treats
    # the dry-run flag is not visible in this file.
    with connect_db(FLAGS.dry_run) as db:
        # Region lookup table: one region per university.
        db.execute("""CREATE TABLE IF NOT EXISTS icpc_geo (
region TEXT NOT NULL,
univ TEXT NOT NULL PRIMARY KEY)""")
        # The `icpc` view joins the ICPC rows of `rankings` with their
        # region (NULL when the university has no icpc_geo entry).
        db.execute("""CREATE VIEW IF NOT EXISTS icpc AS
SELECT year, rank, r.univ, region, score, penalty
FROM rankings r LEFT JOIN icpc_geo g ON r.univ=g.univ
WHERE contest='icpc'
""")
        # --clear is ignored during a dry run.
        if FLAGS.clear and not FLAGS.dry_run:
            db.execute("DELETE FROM rankings WHERE contest='icpc'")
            db.execute("DELETE FROM icpc_geo")
        for year in FLAGS.years:
            tree = fetch(year)
            for scrape in scrape_fns:
                try:
                    scrape(tree, year, db)
                # An integrity error from the inserts is reported as the
                # year having been scraped before; scraping continues.
                except sqlite3.IntegrityError as e:
                    v_print(0, "warning: skipping already scraped year %d"
                            % year)
        if args.regions_file:
            # One "region<TAB>university" mapping per line; line numbers
            # in error messages start at 1.
            for num, line in enumerate(
                map(lambda s: s.strip(), args.regions_file), 1
            ):
                try:
                    r, u = line.split("\t")
                    v_print(2, "parsed mapping from %s: %s"
                            % (args.regions_file.name, (r, u)))
                    # OR REPLACE lets a mapping from the file override an
                    # existing entry for the same university.
                    db.execute("INSERT OR REPLACE INTO icpc_geo VALUES (?, ?)",
                               (r, u))
                # A line that cannot be processed (e.g. wrong number of
                # tab-separated fields) is logged and skipped.
                except Exception as e:
                    v_print(0, "error: failed to parse %s @ line#%d: %s"
                            % (args.regions_file.name, num, e))
        db.commit()
        # Run each positional QUERY and print its rows, followed by a
        # blank line.
        for s in FLAGS.queries:
            query(s, db)
            print()
    return 0
def v_print(min_verbosity, *args, **kwargs):
    """Print ``args`` when the verbosity level is at least ``min_verbosity``.

    The level is read from ``FLAGS.v``.  Positional and keyword arguments
    are forwarded to ``print``; unless the caller supplies ``file``, the
    output goes to ``sys.stderr``.
    """
    if FLAGS.v < min_verbosity:
        return
    if 'file' not in kwargs:
        kwargs['file'] = sys.stderr
    print(*args, **kwargs)
if __name__ == "__main__":
    # Run the CLI and use main()'s return value as the process exit status.
    raise SystemExit(main())