# NOTE: This file was captured from a web view of the repository; the page's
# line-number gutter (1-83) has been dropped as extraction residue.
#
# This repository has been archived by the owner on Jul 7, 2023. It is now read-only.
# Notifications: You must be signed in to change notification settings - Fork 8
# File: DL.py
# 83 lines (55 loc) · 2.45 KB
import re
import dateparser
import datetime
from .bulletin import Bulletin
from bs4 import BeautifulSoup
class Delhi(Bulletin):
    """Scraper for the Delhi ('DL') COVID-19 bulletin listing pages.

    For each configured year/month the class fetches a listing page, extracts
    the bulletin links it contains keyed by ISO date, and hands them to the
    download helper. ``get_url_html``, ``download_bulletins`` and
    ``_save_state_`` are not defined here; presumably they are inherited from
    ``Bulletin`` (verify against the base class).
    """

    def __init__(self, basedir):
        """Set up URLs, the link-text pattern and the crawl range.

        Args:
            basedir: Forwarded unchanged to ``Bulletin.__init__`` together
                with the state name ``'DL'``.
        """
        statename = 'DL'
        super().__init__(basedir, statename)

        self.baseurl = 'http://health.delhigovt.nic.in'
        # The two placeholders are filled with (month name, year).
        self.bulletin_url = self.baseurl + '/wps/wcm/connect/doit_health/Health/Home/Covid19/Bulletin+{}+{}'

        # Matches anchor text such as "bulletin ... dated <date>"; group 1 is
        # the free-form date text.
        self._bulletin_link_regex = re.compile(r'^bulletin.*dated (.*)', re.IGNORECASE)

        self._months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August',
                        'September', 'October', 'November', 'December']
        self._years = ['2020', '2021', '2022']

    def get_website_html(self, month, year):
        """Return the HTML of the bulletin listing page for ``month``/``year``.

        Args:
            month: Month name as it appears in the page URL (e.g. 'June').
            year: Year as a string (e.g. '2020').
        """
        url = self.bulletin_url.format(month, year)
        html = self.get_url_html(url)
        return html

    def get_bulletin_links(self, html, year):
        """Extract bulletin links from a listing page.

        Args:
            html: HTML text of a listing page.
            year: Year the listing page belongs to. It overrides whatever
                year the date parser infers from the anchor text.

        Returns:
            dict mapping 'YYYY-MM-DD' to the bulletin URL
            (``self.baseurl`` + the anchor's ``href``). Anchors without an
            ``href`` or with an unusable date are skipped.
        """
        link_dict = {}

        # Nothing to parse (e.g. the fetch produced no content).
        if not html:
            return link_dict

        soup = BeautifulSoup(html, 'html.parser')

        for anchor in soup.find_all('a'):
            # Collapse runs of whitespace so the regex sees single spaces.
            anchor_text = ' '.join(anchor.text.lower().split())
            match = self._bulletin_link_regex.match(anchor_text)
            if not match:
                continue

            # An anchor without href cannot be downloaded. Previously this
            # case fell through and reused a stale (or undefined) href.
            href = anchor.get('href')
            if not href:
                continue

            date_text = match.group(1)
            parsed = dateparser.parse(date_text)
            if parsed is None:
                print(f'\t Skipping Delhi bulletin link with unparseable date: {date_text!r}')
                continue

            try:
                # Force the year of the listing page onto the parsed date.
                dateobj = datetime.datetime(int(year), parsed.month, parsed.day)
            except ValueError:
                # e.g. 29 February combined with a non-leap year.
                print(f'\t Skipping Delhi bulletin link with invalid date for {year}: {date_text!r}')
                continue

            datestr = f'{dateobj.year}-{dateobj.month:02d}-{dateobj.day:02d}'
            link_dict[datestr] = self.baseurl + href

        return link_dict

    def run(self):
        """Download bulletins for every configured month and year.

        Returns:
            dict mapping 'YYYY-MM-DD' to bulletin URL for all links found.
        """
        # Months of 2020 that are not crawled. NOTE(review): presumably no
        # listing pages exist for them -- confirm. The original also rewrote
        # 'April' to 'Apr' for 2020, but that branch was unreachable because
        # April 2020 is already skipped here, so it has been removed.
        skipped_2020_months = ('March', 'April', 'May')

        all_links = {}
        for year in self._years:
            for month in self._months:
                if year == '2020' and month in skipped_2020_months:
                    continue

                print(f'\t Downloading Delhi bulletins for year {year} and month {month}')
                html = self.get_website_html(month, year)
                bulletin_links = self.get_bulletin_links(html, year)
                self.download_bulletins(bulletin_links)
                all_links.update(bulletin_links)

        # NOTE(review): state is saved once after the whole crawl; the
        # original indentation was lost, so confirm this placement.
        self._save_state_()
        return all_links
if __name__ == '__main__':
    import sys

    # Script entry point: crawl Delhi bulletins into a local store directory.
    # The directory may be passed as the first command-line argument; the
    # original developer path is kept as the default for backward
    # compatibility.
    default_basedir = '/Users/mayank.agarwal@ibm.com/Documents/projects/covid-19/covid-india-data/localstore'
    basedir = sys.argv[1] if len(sys.argv) > 1 else default_basedir

    obj = Delhi(basedir)
    obj.run()