01pages.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Cкрейпинг/парсинг http://vybary2019.by на чистом Python без зависимостей.
"""
import csv
import os
import re
import sys
from urllib.request import urlopen

URL = 'http://vybary2019.by'


def write_csv(filename, data, header):
    """Write rows to filename as a unix-dialect CSV with the given header."""
    # newline='' lets the csv module control line endings itself
    with open(filename, 'w', encoding='utf-8', newline='') as fd:
        print('writing %s' % fd.name)
        writer = csv.writer(fd, dialect='unix')
        writer.writerow(header)
        for row in data:
            writer.writerow(row)

def get_page(url, cachefile, force=False):
    """
    Fetch the page from URL unless a local cachefile already exists
    """
    if os.path.exists(cachefile) and not force:
        print('using cached ' + cachefile + ' (-f to force update)')
        with open(cachefile, 'rb') as fc:
            return fc.read().decode('utf-8')
    req = urlopen(url)
    output = req.read()
    print(f'caching {url} to {cachefile}')
    with open(cachefile, 'wb') as cw:
        cw.write(output)
    return output.decode('utf-8')

def parse_region_list(content):
    """Return [number, name, url] for every region link on the page."""
    reregion = re.compile(r'href="(/regions/(\d+)\.html)">([^<]+)<')
    return [[number, name, URL + path]
            for path, number, name in reregion.findall(content)]

def parse_region_data(content):
    """Extract voter count, commission location, contacts and border text."""
    # the patterns match Russian labels on the region pages:
    # "Количество избирателей" = number of voters,
    # "...тельной комиссии" = location of the (electoral) commission,
    # "Контакты" = contacts
    reppl = re.compile('Количество избирателей</strong>([^<]+)<')
    reloc = re.compile('тельной комиссии</strong>([^<]+)<')
    recon = re.compile('Контакты:</strong>([^<]+)<')
    rebor = re.compile('region-border">(.+?)</(?:p|div)>', re.DOTALL)
    ppl = reppl.findall(content)[0]  # take the first (and only) match
    # strip all markup from ' – 65 522.\n'
    ppl = ppl.replace(' ', '').strip('\r\n.–')
    loc = reloc.findall(content)[0]
    loc = loc.replace('\r\n', ' ').strip('\r\n\t:. ')
    con = recon.findall(content)[0]
    con = re.sub(r'\s+', ' ', con).strip()
    bor = rebor.findall(content)[0]
    bor = re.sub(r'\s+', ' ', bor).strip()
    return ppl, loc, con, bor

if __name__ == '__main__':
    force = '-f' in sys.argv
    # directories cache/ and dataset/ need to be present
    os.makedirs('cache', exist_ok=True)
    os.makedirs('dataset', exist_ok=True)
    content = get_page(URL + '/regions.html', 'cache/regions.html', force)
    # number, name, url
    regions = parse_region_list(content)
    header = 'number name url'.split()
    nameidx = header.index('name')
    for idx, (number, name, url) in enumerate(regions):
        content = get_page(url, f'cache/region{number}.html', force)
        ppl, loc, con, border = parse_region_data(content)
        # insert people, location and contact fields right after the name
        regions[idx].insert(nameidx + 1, ppl)
        regions[idx].insert(nameidx + 2, loc)
        regions[idx].insert(nameidx + 3, con)
        regions[idx].append(border)
    header.insert(nameidx + 1, 'people')
    header.insert(nameidx + 2, 'location')
    header.insert(nameidx + 3, 'contact')
    header.append('border')
    write_csv('dataset/regions.csv', regions, header)
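
# A minimal sketch of reading the generated CSV back for a follow-up step;
# it assumes a successful run has written dataset/regions.csv with the
# header built in __main__ above:
#
#     import csv
#     with open('dataset/regions.csv', encoding='utf-8') as fd:
#         for row in csv.DictReader(fd):
#             print(row['number'], row['name'], row['people'])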