-
Notifications
You must be signed in to change notification settings - Fork 0
/
lfc_bulk_downloader.py
234 lines (196 loc) · 7.49 KB
/
lfc_bulk_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
# -*- coding: utf-8 -*-
"""
NOTES:
- Data fields contain OrderedDicts with the Comments field, a list of comments per Judge
- Convert all OrderedDicts to json strings
- Convert all lists to comma-separated strings
"""
# Field names, grouped by theme. get_proposals looks each one up on every
# proposal record, so the order below is also the order in which the keys
# are written to each output row.
#
# NOTE(review): "Peer TRANFORMATIVE Score Normalized" and
# "Peer IMPACTIFUL Judge Data" are reproduced exactly as originally written;
# confirm against the upstream field names before "correcting" the spelling.

# Application and organization metadata.
_APPLICATION_FIELDS = (
    "Application #",
    "Competition Domain",
    "Project Title",
    "Organization Name",
    "Organization Location",
    "Executive Summary",
    "Project Description",
    "Key Words and Phrases",
    "Primary Subject Area",
    "Annual Operating Budget",
    "Number of Employees",
    "Project Website",
    "Total Projected Costs",
    "Sustainable Development Goals",
    "Priority Populations",
    "Achievement Level",
    "LFC Financial Data",
    "LFC Analysis",
)

# Overall scores and ranks.
_SUMMARY_FIELDS = (
    "Panel Score",
    "Rank",
    "Peer Score",
    "Panel Rank",
)

# Panel: normalized score per trait.
_PANEL_SCORE_FIELDS = (
    "Panel EVIDENCE-BASED Score Normalized",
    "Panel FEASIBLE Score Normalized",
    "Panel IMPACTFUL Score Normalized",
    "Panel COMMUNITY-INFORMED Score Normalized",
    "Panel SCALABLE Score Normalized",
    "Panel EQUITABLE Score Normalized",
    "Panel ACTIONABLE Score Normalized",
    "Panel BOLD Score Normalized",
    "Panel INNOVATIVE Score Normalized",
    "Panel TRANSFORMATIVE Score Normalized",
    "Panel DURABLE Score Normalized",
)

# Panel: judge data per trait.
_PANEL_JUDGE_FIELDS = (
    "Panel DURABLE Judge Data",
    "Panel EVIDENCE-BASED Judge Data",
    "Panel FEASIBLE Judge Data",
    "Panel IMPACTFUL Judge Data",
    "Panel COMMUNITY-INFORMED Judge Data",
    "Panel SCALABLE Judge Data",
    "Panel EQUITABLE Judge Data",
    "Panel ACTIONABLE Judge Data",
    "Panel BOLD Judge Data",
    "Panel INNOVATIVE Judge Data",
    "Panel TRANSFORMATIVE Judge Data",
)

# Peer: normalized score per trait, followed by the normalized sum.
_PEER_SCORE_FIELDS = (
    "Peer EVIDENCE-BASED Score Normalized",
    "Peer FEASIBLE Score Normalized",
    "Peer IMPACTFUL Score Normalized",
    "Peer COMMUNITY-INFORMED Score Normalized",
    "Peer SCALABLE Score Normalized",
    "Peer EQUITABLE Score Normalized",
    "Peer ACTIONABLE Score Normalized",
    "Peer BOLD Score Normalized",
    "Peer INNOVATIVE Score Normalized",
    "Peer TRANFORMATIVE Score Normalized",
    "Peer Sum of Scores Normalized",
)

# Peer: judge data per trait.
_PEER_JUDGE_FIELDS = (
    "Peer DURABLE Judge Data",
    "Peer EVIDENCE-BASED Judge Data",
    "Peer FEASIBLE Judge Data",
    "Peer IMPACTIFUL Judge Data",
    "Peer COMMUNITY-INFORMED Judge Data",
    "Peer SCALABLE Judge Data",
    "Peer EQUITABLE Judge Data",
    "Peer ACTIONABLE Judge Data",
    "Peer BOLD Judge Data",
    "Peer INNOVATIVE Judge Data",
    "Peer TRANSFORMATIVE Judge Data",
)

FIELDS = [
    *_APPLICATION_FIELDS,
    *_SUMMARY_FIELDS,
    *_PANEL_SCORE_FIELDS,
    *_PANEL_JUDGE_FIELDS,
    *_PEER_SCORE_FIELDS,
    *_PEER_JUDGE_FIELDS,
]
import mwclient
import time
import json
import asyncio
import re
import html
import pandas as pd
from collections import OrderedDict
# Patterns and lookup tables used by _clean_value, built once at import time.
_NON_DIGITS = re.compile(r"[^0-9]")
_MARKUP_TAG = re.compile(r"<[^<>]*>")
_SPACE_RUN = re.compile(r" {2,}")
_NA_PREFIXES = ("Not Applicable", "N/A", "N/a", "n/a")
_NA_EXACT = frozenset({"na", "NA", "Na", "None"})


def _clean_value(field, val):
    """Flatten one raw proposal value into a scalar suitable for a CSV cell.

    - list: joined with ", "; None if the items are not all strings.
    - dict (including OrderedDict): for fields whose name contains "Score",
      the "Raw" entry converted to float (None if missing or not numeric);
      otherwise the mapping serialized as a JSON string.
    - "Applicant Tax Identification Number": digits only, as int (None if no
      digits remain).
    - str: HTML entities unescaped; strings longer than 25 characters (except
      "Project Website") have markup tags replaced by spaces and runs of
      spaces collapsed; shorter strings that look like "not applicable"
      markers become "".
    - anything else is returned unchanged.
    """
    if isinstance(val, list):
        try:
            return ", ".join(val)
        except TypeError:
            # At least one element is not a string.
            return None

    # dict also matches OrderedDict, so the check works whichever mapping
    # type the API client hands back.
    if isinstance(val, dict):
        if "Score" in field:
            try:
                return float(val.get("Raw"))
            except (TypeError, ValueError):
                # "Raw" is absent (None) or not parseable as a number.
                return None
        return json.dumps(val)

    if field == "Applicant Tax Identification Number" and val:
        digits = _NON_DIGITS.sub("", str(val))
        return int(digits) if digits else None

    if isinstance(val, str):
        val = html.unescape(val)
        if len(val) > 25 and field != "Project Website":
            # Treated as rich text: strip tags, then squeeze the spaces the
            # tag replacement leaves behind.
            return _SPACE_RUN.sub(" ", _MARKUP_TAG.sub(" ", val))
        if val.startswith(_NA_PREFIXES) or val in _NA_EXACT:
            return ""

    return val


def get_proposals(site, proposals):
    """Fetch and clean every proposal in *proposals*.

    Args:
        site: object whose ``api(action, format=..., path=...)`` call returns
            a mapping with a "result" entry (``main`` passes an
            ``mwclient.Site``).
        proposals: iterable of ``(competition, proposal_id)`` pairs.

    Returns:
        A list with one dict per proposal, keyed by the names in ``FIELDS``.
        The "Competition Domain" field is not copied; a "Competition" key
        holding the competition name from the pair is written in its place.
    """
    rows = []
    for c, id_ in proposals:
        print(f"Adding proposal {id_} from competition {c}")
        prop = site.api(
            "torquedataconnect",
            format="json",
            path=f"/competitions/{c}/proposals/{id_}",
        )["result"]

        row = {}
        for field in FIELDS:
            if field == "Competition Domain":
                row["Competition"] = c
            else:
                row[field] = _clean_value(field, prop.get(field))
        rows.append(row)
    return rows
def _split_into_chunks(items, n_chunks):
    """Split *items* into at most *n_chunks* contiguous, non-overlapping slices.

    Every element lands in exactly one slice, slice sizes differ by at most
    one, and no empty slices are returned (so fewer than *n_chunks* slices
    come back when there are fewer items than chunks).
    """
    n_chunks = max(1, n_chunks)
    base, extra = divmod(len(items), n_chunks)
    chunks = []
    start = 0
    for i in range(n_chunks):
        # The first `extra` chunks absorb the remainder, one item each.
        size = base + (1 if i < extra else 0)
        if size == 0:
            break
        chunks.append(items[start : start + size])
        start += size
    return chunks


async def main(username, api_key, *, threads=20, output_path="LFC_Proposals.csv"):
    """Download every proposal of every competition and write them to a CSV.

    Args:
        username: MediaWiki username used to log in.
        api_key: MediaWiki API key (password) used to log in.
        threads: maximum number of worker threads the proposal list is
            divided between.
        output_path: destination of the CSV file.
    """
    site = mwclient.Site("torque.leverforchange.org/", "GlobalView/", scheme="https")
    site.login(username, api_key)
    t0 = time.perf_counter()

    # Get all Competition titles
    competitions = site.api("torquedataconnect", format="json", path="/competitions")[
        "result"
    ]
    print([c for c in competitions])

    # Collect every (competition, id) pair
    proposal_ids = []
    for c in competitions:
        proposals = site.api(
            "torquedataconnect", format="json", path=f"/competitions/{c}/proposals"
        )["result"]
        proposal_ids.extend((c, id_) for id_ in proposals)
    # Remove duplicates just in case; dict.fromkeys keeps first-seen order so
    # the output row order is reproducible between runs.
    proposal_ids = list(dict.fromkeys(proposal_ids))

    # Each proposal must be handed to exactly one worker: overlapping or
    # truncated slices would duplicate or drop rows in the output.
    chunks = _split_into_chunks(proposal_ids, threads)

    # Pass the arguments to to_thread directly instead of wrapping the call
    # in a lambda: a lambda created in a loop/comprehension binds its loop
    # variable late, so every task would see the last chunk.
    res = await asyncio.gather(
        *(asyncio.to_thread(get_proposals, site, chunk) for chunk in chunks)
    )
    rows = [row for chunk_rows in res for row in chunk_rows]

    t = round(time.perf_counter() - t0, 0)
    print(f"Finished in {t} seconds")

    pd.DataFrame(rows).to_csv(output_path, index=False)
if __name__ == "__main__":
    # Imported here because only the interactive entry point needs it.
    from getpass import getpass

    username = input("Please enter your MediaWiki username: ")
    # The API key is a secret: read it without echoing it to the terminal.
    api_key = getpass("Please enter your MediaWiki API key: ")
    asyncio.run(main(username, api_key))