Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add parser for official case counts in Ontario, Canada #705

Merged
merged 1 commit into from
May 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions data/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ requests = "*"
xlrd = "*"
schemapi = {editable = true,git = "https://github.com/altair-viz/schemapi.git"}
ipython = "*"
pandas = "*"

[requires]
python_version = "3.8"
219 changes: 110 additions & 109 deletions data/case-counts/canada/CAN-Ontario.tsv
Original file line number Diff line number Diff line change
@@ -1,109 +1,110 @@
# Data source: https://github.com/ishaberry/Covid19Canada
# Data provenance: Official government data
# License: none specified
time cases deaths hospitalized icu recovered
2020-01-25 1 0
2020-01-27 2 0
2020-01-31 3 0
2020-02-12 3 0 1
2020-02-13 3 0 1
2020-02-14 3 0 1
2020-02-15 3 0 1
2020-02-16 3 0 1
2020-02-17 3 0 1
2020-02-18 3 0 1
2020-02-19 3 0 1
2020-02-20 3 0 1
2020-02-21 3 0 1
2020-02-22 3 0 1
2020-02-23 4 0 3
2020-02-24 4 0 1
2020-02-25 4 0 1
2020-02-26 5 0 3
2020-02-27 6 0 3
2020-02-28 8 0 3
2020-02-29 11 0 3
2020-03-01 15 0 3
2020-03-02 15 0 1
2020-03-03 20 0 3
2020-03-04 20 0 1
2020-03-05 22 0 4
2020-03-06 28 0 4
2020-03-07 28 0 1
2020-03-08 32 0 4
2020-03-09 35 0 5
2020-03-10 37 0 5
2020-03-11 42 1 5
2020-03-12 59 1 5
2020-03-13 79 1 5
2020-03-14 103 1 5
2020-03-15 145 1 5
2020-03-16 177 1 5
2020-03-17 189 1 5
2020-03-18 214 1 5
2020-03-19 258 2 5
2020-03-20 318 2 5
2020-03-21 377 3 6
2020-03-22 425 6 8
2020-03-23 503 6 8
2020-03-24 588 8 8
2020-03-25 688 13 8
2020-03-26 858 15 8
2020-03-27 993 18 8
2020-03-28 1144 19 8
2020-03-29 1355 23 8
2020-03-30 1706 34 431
2020-03-31 1966 58 534
2020-04-01 2851 71 689
2020-04-02 3288 85 831
2020-04-03 3686 103 1023
2020-04-04 4021 133 1219
2020-04-05 4441 141 1449
2020-04-06 4845 162 1624
2020-04-07 5391 189 1802
2020-04-08 5838 211 2074
2020-04-09 6397 233 2305
2020-04-10 6804 268 2574
2020-04-11 7293 289 2858
2020-04-12 7726 302 3121
2020-04-13 8220 332 3357
2020-04-14 8780 397 3568
2020-04-15 9302 445 3902
2020-04-16 9821 498 4194
2020-04-17 10447 533 4556
2020-04-18 11008 571 4875
2020-04-19 11539 602 5209
2020-04-20 12214 651 5515
2020-04-21 12706 697 5806
2020-04-22 13418 757 6221
2020-04-23 13995 798 6680
2020-04-24 14485 850 7087
2020-04-25 14913 898 7509
2020-04-26 15400 945 8000
2020-04-27 15856 1002 8525
2020-04-28 16314 1050 8964
2020-04-29 16777 1133 9612
2020-04-30 17168 1176 10205
2020-05-01 17720 1237 10825
2020-05-02 18158 1264 11390
2020-05-03 18401 1302 12005
2020-05-04 18950 1426 12505
2020-05-05 19330 1485 12779
2020-05-06 19779 1544 13222
2020-05-07 20260 1591 13569
2020-05-08 20690 1648 13990
2020-05-09 21010 1708 14383
2020-05-10 21325 1726 14772
2020-05-11 21687 1775 15131
2020-05-12 22031 1838 15391
2020-05-13 22388 1870 15845
2020-05-14 22749 1902 16204
2020-05-15 23142 1925 16641
2020-05-16 23515 1959 17020
2020-05-17 23845 1982 17360
2020-05-18 24153 2002 17638
2020-05-19 24655 2017 17898
2020-05-20 25094 2066 18190
2020-05-21 25498 2090 18509
2020-05-22 25995 2112 18767
2020-05-23 26466 2139 19146
time cases deaths hospitalized icu recovered
2020-01-26
2020-01-27
2020-01-28 1
2020-01-30 2
2020-01-31 2
2020-02-03 3
2020-02-04 3
2020-02-05 3
2020-02-06 3
2020-02-08 3
2020-02-10 3
2020-02-11 3
2020-02-12 3 1
2020-02-13 3 1
2020-02-14 3 1
2020-02-17 3 1
2020-02-18 3 1
2020-02-19 3 1
2020-02-20 3 2
2020-02-21 3 3
2020-02-24 4 3
2020-02-25 4 3
2020-02-26 4 3
2020-02-27 5 3
2020-02-28 6 3
2020-03-02 18 3
2020-03-03 20 3
2020-03-04 20 3
2020-03-05 22 4
2020-03-06 26 4
2020-03-07 28 4
2020-03-08 29 4
2020-03-09 35 4
2020-03-10 36 5
2020-03-11 42 5
2020-03-12 59 5
2020-03-13 79 5
2020-03-14 103 5
2020-03-15 145 5
2020-03-16 177 5
2020-03-17 189 1 5
2020-03-18 214 1 5
2020-03-19 258 2 5
2020-03-20 318 2 5
2020-03-21 377 2 6
2020-03-22 425 5 8
2020-03-23 503 6 8
2020-03-24 588 8 8
2020-03-25 688 13 8
2020-03-26 858 15 8
2020-03-27 993 18 8
2020-03-28 1144 19 8
2020-03-29 1355 23 8
2020-03-30 1706 23 431
2020-03-31 1966 33 534
2020-04-01 2392 37 674
2020-04-02 2793 53 405 167 831
2020-04-03 3255 67 462 194 1023
2020-04-04 3630 94 506 196 1219
2020-04-05 4038 119 523 200 1449
2020-04-06 4347 132 589 216 1624
2020-04-07 4726 153 614 233 1802
2020-04-08 5276 174 605 246 2074
2020-04-09 5759 200 632 264 2305
2020-04-10 6237 222 673 260 2574
2020-04-11 6648 253 691 257 2858
2020-04-12 7049 274 738 261 3121
2020-04-13 7470 291 760 263 3357
2020-04-14 7953 334 769 255 3568
2020-04-15 8447 385 795 254 3902
2020-04-16 8961 423 807 248 4194
2020-04-17 9525 478 829 245 4556
2020-04-18 10010 514 828 250 4875
2020-04-19 10578 553 809 247 5209
2020-04-20 11184 584 802 247 5515
2020-04-21 11735 622 859 250 5806
2020-04-22 12245 659 878 243 6221
2020-04-23 12879 713 887 233 6680
2020-04-24 13519 763 910 243 7087
2020-04-25 13995 811 925 245 7509
2020-04-26 14432 835 938 252 8000
2020-04-27 14856 892 945 241 8525
2020-04-28 15381 951 957 239 8964
2020-04-29 15728 996 977 235 9612
2020-04-30 16187 1082 999 233 10205
2020-05-01 16608 1121 1017 225 10825
2020-05-02 17119 1176 977 221 11390
2020-05-03 17553 1216 1010 232 12005
2020-05-04 17923 1300 984 225 12505
2020-05-05 18310 1361 1043 223 12779
2020-05-06 18722 1429 1032 219 13222
2020-05-07 19121 1477 1033 220 13569
2020-05-08 19598 1540 1028 213 13990
2020-05-09 19944 1599 1016 203 14383
2020-05-10 20238 1634 961 195 14772
2020-05-11 20546 1669 1027 194 15131
2020-05-12 20907 1725 1025 192 15391
2020-05-13 21236 1765 1018 189 15845
2020-05-14 21581 1798 1026 184 16204
2020-05-15 21922 1825 986 179 16641
2020-05-16 22313 1858 975 180 17020
2020-05-17 22653 1881 934 171 17360
2020-05-18 22957 1904 972 174 17638
2020-05-19 23384 1919 987 167 17898
2020-05-20 23774 1962 991 160 18190
2020-05-21 24187 1993 984 155 18509
2020-05-22 24628 2021 961 153 18767
2020-05-23 25040 2048 912 147 19146
2020-05-24 25500 2073 878 148 19477
59 changes: 59 additions & 0 deletions data/parsers/canada-ontario.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import os
from typing import Dict

import pandas as pd
import requests

from parsers.utils import sanitize
from paths import BASE_PATH, TSV_DIR

# Country directory and region name used to build the output TSV path.
COUNTRY = 'canada'
REGION = 'CAN-Ontario'

# Datastore search endpoint queried by the parser; `offset` and `limit`
# query parameters are appended per request.
URL = 'https://data.ontario.ca/api/3/action/datastore_search?resource_id=ed270bb8-340b-41f9-a7c6-e8ef587e6d11'

# Page size requested from the API; the parser reads back the `limit`
# reported in the response and uses that value for subsequent pages.
DESIRED_PAGE_SIZE = 1000

# Column names of the output TSV, in order.
cols = ['time', 'cases', 'deaths', 'hospitalized', 'icu', 'recovered']

# This is the mapping of columns: source dataset column name -> TSV column
# name. Only the columns listed here are kept, in this order.
dcols = {
    'Reported Date': 'time',
    'Total Cases': 'cases',
    'Deaths': 'deaths',
    'Number of patients hospitalized with COVID-19': 'hospitalized',
    'Number of patients in ICU with COVID-19': 'icu',
    'Resolved': 'recovered',
}


def _fetch_page(offset, limit):
    """Fetch one page of the datastore search and return its ``result`` object.

    Raises ``requests.HTTPError`` on a non-2xx response and ``KeyError`` if the
    JSON payload has no ``result`` key.
    """
    response = requests.get(f'{URL}&offset={offset}&limit={limit}', timeout=60)
    response.raise_for_status()
    return response.json()['result']


def parse():
    """Download the Ontario case counts and write them to the region TSV.

    All pages of the datastore search at ``URL`` are fetched, the columns
    listed in ``dcols`` are selected and renamed, and the table is written
    tab-separated (indexed by ``time``) to
    ``{BASE_PATH}/{TSV_DIR}/{COUNTRY}/{region}.tsv``.

    Raises:
        ValueError: if the API reports a non-positive page size, or if the
            number of downloaded rows differs from the reported total.
    """
    first_page = _fetch_page(0, DESIRED_PAGE_SIZE)
    n_rows = int(first_page['total'])
    # Use the limit echoed back by the API rather than the one we asked for
    # (presumably the server may apply a different page size).
    page_size = int(first_page['limit'])
    if page_size <= 0:
        raise ValueError(f'API reported an invalid page size: {page_size}')

    frames = [pd.DataFrame.from_dict(first_page['records'])]
    # Stepping by offsets up to (but excluding) n_rows avoids an extra empty
    # request when the total is an exact multiple of the page size.
    for offset in range(page_size, n_rows, page_size):
        page = _fetch_page(offset, page_size)
        frames.append(pd.DataFrame.from_dict(page['records']))

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    df = pd.concat(frames, ignore_index=True)

    df = df[list(dcols)].rename(columns=dcols)
    df['time'] = pd.to_datetime(df['time'])
    df = df.set_index('time')

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(df.index) != n_rows:
        raise ValueError(
            f'Expected {n_rows} rows from the API but downloaded {len(df.index)}'
        )

    region = sanitize(REGION)
    filepath = f'{BASE_PATH}/{TSV_DIR}/{COUNTRY}/{region}.tsv'
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # '%i' writes the counts as integers even though columns containing
    # missing values are typically float-typed in pandas.
    df.to_csv(filepath, sep='\t', na_rep='', float_format='%i', date_format='%Y-%m-%d')


# Allow running this parser directly as a script.
if __name__ == '__main__':
    parse()
5 changes: 4 additions & 1 deletion data/parsers/canada.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,10 @@ def parse():
elif state == 'CAN-PEI':
state = 'CAN-Prince Edward Island'


# Ontario is handled in its own parser `canada-ontario.py`
if state == 'CAN-Ontario':
continue

# Hack: recovered currently has no county-level data.
county = None
# county-level removed as requested in https://github.com/neherlab/covid19_scenarios_data/pull/42#issuecomment-603427339
Expand Down
5 changes: 5 additions & 0 deletions data/sources.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@
"dataProvenance": "Official government data",
"license": "none specified"
},
"canada-ontario": {
"primarySource": "https://data.ontario.ca/api/3/action/datastore_search?resource_id=ed270bb8-340b-41f9-a7c6-e8ef587e6d11",
"dataProvenance": "Ontario.ca - official website of the Ontario Government. Dataset: \"Confirmed positive cases of COVID19 in Ontario\".",
"license": "Open Government Licence – Ontario (https://www.ontario.ca/page/open-government-licence-ontario)"
},
"unitedstates": {
"primarySource": "https://covidtracking.com/api/states/daily",
"dataProvenance": "The COVID Tracking Project",
Expand Down
Loading