# Part 1 & 2
# Importing packages
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import psycopg2
import pandas as pd
import random
import seaborn as sns
import statsmodels.formula.api as smf
from IPython.display import Image
# Connecting to covid19db.org
conn = psycopg2.connect(
host='covid19db.org',
port=5432,
dbname='covid19',
user='covid19',
password='covid19')
cur = conn.cursor()
# Functions
def cases_by_govresp(govresp, data):
    '''Mean total confirmed cases per million at the specified government response index'''
    return data[abs(data["gov_resp"] - govresp) < 1]["total confirmed/million"].mean()
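# Illustrative call (hypothetical value): once gov_epi is built below,
# cases_by_govresp(50, gov_epi) averages "total confirmed/million" over rows
# whose response index lies within 1 point of 50.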
def random_low_high_countries(data):
    '''Returns six random countries: three with lower initial responses and three with higher.
    Selected countries are more than half a standard deviation away from the mean.'''
    i_response = data["initial_response"]
    low = i_response.mean() - (i_response.std() / 2)
    high = i_response.mean() + (i_response.std() / 2)
    low_countries = list(data[data["initial_response"] < low].country.unique())
    high_countries = list(data[data["initial_response"] > high].country.unique())
    random_lowitems = random.sample(low_countries, 3)
    random_highitems = random.sample(high_countries, 3)
    low_high = random_lowitems + random_highitems
    return low_high
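# Note: the draw is random on each call; seeding first (e.g. random.seed(0),
# an arbitrary illustrative value) would make Figure 2 reproducible across runs.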
# Collecting country-level data
sql_command = """SELECT date,country,countrycode,confirmed,dead FROM epidemiology WHERE
adm_area_1 IS NULL AND
adm_area_2 IS NULL AND
adm_area_3 IS NULL AND
source= 'WRD_WHO'
ORDER BY
date"""
df_epidemiology = pd.read_sql(sql_command, conn)
df_epidemiology["date"]=pd.to_datetime(df_epidemiology['date'], format='%Y-%m-%d')
df_epidemiology["confirmed"]=df_epidemiology["confirmed"].astype(int)
df_epidemiology["dead"]=df_epidemiology["dead"].astype(int)
# Creating a new column for daily confirmed cases
for country in df_epidemiology.country.unique():
    mask = df_epidemiology["country"] == country
    df_epidemiology.loc[mask, "confirmed/day"] = df_epidemiology[mask]['confirmed'].diff()
    df_epidemiology.loc[mask, "dead/day"] = df_epidemiology[mask]['dead'].diff()
df_epidemiology['confirmed/day'].fillna(0, inplace=True)
df_epidemiology['dead/day'].fillna(0, inplace=True)
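# Equivalent vectorized form (a sketch; it assumes rows are date-sorted within
# each country, which the ORDER BY above provides):
# df_epidemiology["confirmed/day"] = df_epidemiology.groupby("country")["confirmed"].diff().fillna(0)
# df_epidemiology["dead/day"] = df_epidemiology.groupby("country")["dead"].diff().fillna(0)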
sql_command = """SELECT value, country, countrycode FROM world_bank WHERE
indicator_name = 'Population, total' AND
year=2019"""
df_pop = pd.read_sql(sql_command, conn)
df_pop.rename(columns={"value": "population"},inplace=True)
df_pop["population(m)"]=df_pop["population"]/1000000
# Merging with epidemiology data
df_epi=pd.merge(df_epidemiology,df_pop,on=["country","countrycode"], how='left')
# Excluding countries without population data
df_epi.dropna(inplace=True)
# Creating new columns for epidemiology data normalized by population
df_epi["confirmed/million"]= df_epi["confirmed/day"]/df_epi["population(m)"]
df_epi["dead/million"]= df_epi["dead/day"]/df_epi["population(m)"]
df_epi["total confirmed/million"]= df_epi["confirmed"]/df_epi["population(m)"]
df_epi["total dead/million"]= df_epi["dead"]/df_epi["population(m)"]
sql_command ="""SELECT date, country, countrycode, government_response_index_for_display
FROM government_response
WHERE adm_area_1 IS NULL
ORDER BY date """
df_govtrack= pd.read_sql(sql_command, conn)
df_govtrack.rename(columns={"government_response_index_for_display": "gov_resp"},inplace=True)
df_govtrack["date"]=pd.to_datetime(df_govtrack['date'], format='%Y-%m-%d')
df_govtrack.dropna(inplace=True)
# Merging with epidemiology data
gov_epi = pd.merge(df_govtrack, df_epi, on=['country','countrycode','date'])
# E037: The question on Government Responsibility
questions=["E037"]
wave = "2008-2010"
sql_command = """SELECT * FROM surveys WHERE wave=%(wave)s AND adm_area_1 IS NULL AND adm_area_2 IS NULL AND adm_area_3 IS NULL"""
df_surveys = pd.read_sql(sql_command, conn, params={'wave': wave})
# Cleaning data
survey = pd.DataFrame()
survey["country"] = np.zeros(len(df_surveys))
survey["countrycode"] = np.zeros(len(df_surveys))
for i in range(len(df_surveys)):
    survey.loc[i, "country"] = df_surveys["country"].loc[i]
    survey.loc[i, "countrycode"] = df_surveys["countrycode"].loc[i]
    for j in questions:
        title = df_surveys.properties[i][j]["Label"]
        liste = []
        for k, v in df_surveys.properties[i][j]["Frequencies"].items():
            if k[-2] != "-":
                varie = int(k[-2:].replace("_", "")) * v
                liste.append(varie)
        survey.loc[i, title] = sum(liste)
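# Each key in "Frequencies" ends in a two-character answer code, so the inner
# loop sums answer_value * frequency; assuming the stored frequencies are
# proportions that sum to 1, survey[title] is the country's mean answer.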
# Initial responses for the first hundred cases
aggr= {"gov_resp":[np.ptp,min,max], "date":np.ptp, "confirmed":np.ptp}
initial=gov_epi[(gov_epi["confirmed"]>0.1)&(gov_epi["confirmed"]<100)].groupby("country").agg(aggr)
initial.reset_index(inplace=True)
initial["daily case rate"]=initial["confirmed"]["ptp"]/initial["date"]["ptp"].dt.days
initial["initial_response"]=(initial["gov_resp"]["min"]+1)*(initial["gov_resp"]["ptp"]+1)/(initial["daily case rate"]+1)
# Merging the responses with survey data
response_data= initial[["country","initial_response"]].droplevel(1,axis=1)
all_df= pd.merge(survey,response_data,on="country")
all_df.rename(columns={"Government responsibility":"Government_responsibility"},inplace=True)
# Regression model for initial responses and government responsibility
form_1 = 'initial_response ~ Government_responsibility'
fit_1 = smf.ols(formula = form_1, data = all_df).fit()
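# fit_1.summary() would print the full OLS output; the key parameters are
# extracted into table_3 below.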
# Daily Confirmed Cases per Million
def figure_1():
    subset = ['Luxembourg', 'Estonia', 'Belarus', 'Italy', 'Romania', 'Russia', 'Turkey', 'Portugal']
    color_list = ["r", "b", "g", "c", "m", "y", "tab:orange", "tab:purple"]
    # numeric_only avoids errors on string columns in newer pandas
    epi = df_epi.groupby('country').resample('W-MON', on='date').mean(numeric_only=True).reset_index().sort_values(by='date')
    plt.figure(figsize=(10, 10))
    for i, country in enumerate(subset):
        mask = epi["country"] == country
        plt.plot(epi[mask]["date"], epi[mask]['confirmed/million'], linestyle='-', color=color_list[i])
    plt.legend(subset)
    plt.ylabel("Daily confirmed cases per million")
    plt.title("Figure 1: Daily Confirmed Cases per Million", fontsize=25)
    plt.savefig("Figure_1.png")
    plt.show()
# Initial Responses of Several Countries
def figure_2():
    countries = random_low_high_countries(initial)
    fig, axs = plt.subplots(2, 3, figsize=(15, 15), sharey=True)
    fig.suptitle('Figure 2: Initial Responses of Several Countries', fontsize=25)
    fig.subplots_adjust(wspace=0.23, hspace=0.3)
    rows = ['Lower initial response score', 'Higher initial response score']
    for ax, row in zip(axs[:, 0], rows):
        ax.set_ylabel(row, rotation=90, fontsize=25, color='xkcd:dark maroon')
    for ax in axs[0, :]:
        ax.set_facecolor('xkcd:ice blue')
    for ax in axs[1, :]:
        ax.set_facecolor('xkcd:pale lilac')
    filtered_data = gov_epi[gov_epi["confirmed"] < 100].sort_values("date")
    for idx, country in enumerate(countries):
        data = filtered_data[filtered_data['country'] == country]
        date = data['date']
        total_confirmed = data['confirmed']
        gov_response = data['gov_resp']
        argmin = total_confirmed.ne(0).idxmax()  # index label of the first non-zero case count
        ax = axs[idx // 3, idx % 3]
        ax.plot(date, total_confirmed, label="Total Confirmed", color="xkcd:prussian blue")
        first_case = (date[argmin], int(total_confirmed[argmin]))
        ax.plot([first_case[0]], [first_case[1]], marker="*", ls="", c="green", markersize=10)
        ax.annotate("First case", (first_case[0] - dt.timedelta(days=5), first_case[1] - 6), color="green")
        axis = ax.twinx()
        axis.set_ylim([-3, 70])
        axis.plot(date, gov_response, label="Gov Resp", color="xkcd:dirty orange")
        axis.tick_params(axis='y', labelcolor="xkcd:dirty orange")
        lines, labels = ax.get_legend_handles_labels()
        lines2, labels2 = axis.get_legend_handles_labels()
        axis.legend(lines + lines2, labels + labels2, loc='upper right')
        ax.set_title(country.upper(), fontsize=15, color="xkcd:dark maroon")
        ax.tick_params(axis='y', labelcolor="xkcd:prussian blue")
        ax.tick_params(labelrotation=35)
        ax.yaxis.set_tick_params(which='both', labelbottom=True)
    plt.savefig("Figure 2.png")
    plt.show()
# Regression model for initial response and government responsibility
def figure_3():
    g = sns.lmplot(x="Government_responsibility", y="initial_response", data=all_df)
    g.fig.suptitle("Figure 3: Regression Model for Government Responsibility and Initial Responses by Country",
                   fontsize=15, fontweight="bold")
    g.set(xlabel='Government Responsibility', ylabel='Initial Response')
    plt.savefig("Figure 3.png", bbox_inches='tight')
    plt.show()
# Responses at the date of the first positive cases
non_zero= gov_epi[gov_epi["confirmed"]!= 0]
i = non_zero.groupby("country")["confirmed"].idxmin()
first_case=non_zero.loc[i]
table_1 = pd.DataFrame([],columns=["Mean","Std Dev.","Min.","Max."],
index=["Government Response"])
table_1.loc["Government Response"]= [first_case["gov_resp"].mean(),first_case["gov_resp"].std(),first_case["gov_resp"].min(),
first_case["gov_resp"].max()]
# World Values Survey "Government Responsibility" answers
table_2 = pd.DataFrame([],columns=["Mean","Std Dev.","Min.","Max."],
index=["Government Responsibility"])
table_2.loc["Government Responsibility"]= [survey["Government responsibility"].mean(),survey["Government responsibility"].std(),
survey["Government responsibility"].min(),survey["Government responsibility"].max()]
# Countries with minimum and maximum values
minim = survey["Government responsibility"].idxmin()
maxim = survey["Government responsibility"].idxmax()
min_country=survey.loc[minim,"country"]
max_country=survey.loc[maxim,"country"]
# Important Parameters for the Regression Model for initial response and government responsibility
table_3 = pd.DataFrame([],columns=["R squared","Coefficient for Gov. Resp.","p-value for Gov. Resp."],
index=["Regression Model"])
table_3.loc["Regression Model"] = [fit_1.rsquared, fit_1.params["Government_responsibility"], fit_1.pvalues["Government_responsibility"]]
# Code for Part 2
# Importing packages
import apikeys
import requests
import bs4
import time
from scipy import stats
# Heads of state
DOMAIN = "https://en.wikipedia.org"
s = requests.Session()
s.headers.update({'User-agent': 'Python/Elmira wikicrawl0.2'})
article = "List_of_current_heads_of_state_and_government"
req = s.get(f"{DOMAIN}/wiki/{article}")
soup = bs4.BeautifulSoup(req.content, 'html.parser')
heads_table = soup.find('table', {'class': 'wikitable plainrowheaders'})
pre = pd.read_html(str(heads_table))[0]
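# pd.read_html parses every <table> in the HTML it is given and returns a list
# of DataFrames; only the matched wikitable is passed in, so [0] is the table.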
# Only the countries with World Values Survey data
for i in range(len(pre)):
    if pre.loc[i, "State"] not in survey["country"].unique():
        pre.drop([i], inplace=True)
pre.reset_index(inplace=True, drop=True)
pre.drop(columns="Head of government",inplace=True)
# Drop duplicate rows
pre.drop([5,6,8,12,19,43,44,46],inplace=True)
pre.reset_index(inplace=True,drop=True)
twitter_accounts= np.array(['ilirmetazyrtar','Arm_President','vanderbellen', 0,0,0,'PresidentOfBg',0,
'AnastasiadesCY',0,0,'AlarKaris','niinisto','EmmanuelMacron',0,
0,0,0,'PresidentISL','PresidentIRL','Quirinale','valstsgriba','GitanasNauseda',0,'presidentmt',
'sandumaiamd','predsjednik_cg',0,0,'AndrzejDuda','presidencia','KlausIohannis','KremlinRussia_E',
'avucic','ZuzanaCaputova','BorutPahor',0, 0,'ParmelinG','RTErdogan','ZelenskyyUa',0])
# Presidents' Twitter usernames as a new column; 0 marks heads of state without
# an active account (the mixed NumPy array stores these as the string "0")
pre["accounts"]=twitter_accounts
pre.reset_index(inplace=True, drop=True)
# Merging with World Values survey data
pres=pd.merge(pre,survey,left_on="State",right_on="country")
# Drop monarchs
pres.drop([4,10,23,27,28,36,37,41],inplace=True)
pres.reset_index(inplace=True)
# New column categorizing those with active accounts and those without
for i in range(len(pres)):
    if pres.loc[i, "accounts"] == "0":
        pres.loc[i, "Active Twitter Account"] = "No"
    else:
        pres.loc[i, "Active Twitter Account"] = "Yes"
# Selecting those with active Twitter accounts
actives = pre.copy()
for i in range(len(actives)):
    if actives.loc[i, "accounts"] == "0":  # equality, not substring, so usernames containing "0" are kept
        actives.drop([i], inplace=True)
actives.reset_index(inplace=True, drop=True)
# Retrieving id numbers and follower counts from usernames
ids = pd.DataFrame()
headers = {"Authorization": "Bearer {}".format(apikeys.BEARER_TOKEN)}
for account in actives["accounts"].unique():
    url = f"https://api.twitter.com/2/users/by?usernames={account}&tweet.fields=author_id&user.fields=public_metrics"
    response = requests.request("GET", url, headers=headers)
    tweets = response.json()
    tweets = pd.json_normalize(tweets["data"])
    ids = pd.concat([ids, tweets], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
    time.sleep(2)
ids.rename(columns={"public_metrics.followers_count":"followers"},inplace=True)
ids=ids[["id","username","followers"]]
# Merge for the country names
ids_pre=pd.merge(ids,pre,how="left",left_on="username",right_on="accounts")
# Retrieving the last 100 mentions for each id
tweets_all = pd.DataFrame()
n = dt.datetime.now(dt.timezone.utc)  # the datetime module is imported as dt above
end_time = n.isoformat()
headers = {"Authorization": "Bearer {}".format(apikeys.BEARER_TOKEN)}
for i in ids["id"].unique():
    url = f"https://api.twitter.com/2/users/{i}/mentions"
    params = {
        "end_time": end_time,
        "max_results": 100,
        "tweet.fields": "created_at,lang,public_metrics,text",
        "expansions": "author_id",
        "user.fields": "created_at,description,entities,id,name,public_metrics,username"}
    response = requests.request("GET", url, headers=headers, params=params)
    tweets = response.json()
    tweet = pd.json_normalize(tweets["data"])
    tweet["users"] = i
    tweets_all = pd.concat([tweets_all, tweet], ignore_index=True)
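# The v2 mentions endpoint returns up to 100 tweets per request; reaching
# further back would require following pagination_token (not done here).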
# Merging and sorting
tweets_df=pd.merge(tweets_all,ids, right_on=["id","username"],left_on=["users","username"])
tweets_df['created_at']=pd.to_datetime(tweets_df['created_at'])
tweets_df=tweets_df.sort_values(by=['created_at'])
# Calculating the average time between consecutive mentions
means = tweets_df.groupby('username')['created_at'].apply(lambda x: x.diff().mean())
means = means.to_frame()
means.reset_index(inplace=True)
for i in range(len(means)):
    means.loc[i, "seconds"] = means.loc[i, "created_at"].total_seconds()
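# x.diff() gives the gaps between consecutive mention timestamps, so
# means["seconds"] holds each president's average number of seconds between mentions.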
# Merge to collect all information together
mean=pd.merge(means,ids_pre[["username","State","followers"]],how="left",left_on="username",right_on="username")
mean.rename(columns={"State":"country"},inplace=True)
# Merge with population data
mean_pop=pd.merge(mean,df_pop, on="country")
# Analysis
# Calculate mention score
mean["mention_score"]=1/(mean["followers"]*mean["seconds"])
# Merge with World Values Survey data
mean_survey=pd.merge(mean,survey,on="country")
# Transforming to log-scale
mean_survey["ln_mention"]=np.log(mean_survey["mention_score"])
# Regression model for mention score and government responsibility
mean_survey.rename(columns={"Government responsibility":"Government_responsibility"},inplace=True)
form_3 = 'ln_mention ~ Government_responsibility'
fit_3 = smf.ols(formula = form_3, data = mean_survey).fit()
# Figures
# Regression model on a log-log scale
def figure_4():
    mean_pop["ln_pop"] = np.log(mean_pop["population(m)"])
    mean_pop["ln_seconds"] = np.log(mean_pop["seconds"])
    g = sns.lmplot(x="ln_pop", y="ln_seconds", data=mean_pop)
    g.fig.suptitle("Figure 4: Regression Model for Frequency of Mentions and Population in Millions",
                   fontsize=15, fontweight="bold")
    g.set(xlabel='Log(Population(m))', ylabel='Log(Average seconds between consecutive mentions)')
    plt.savefig("Figure 4.png", bbox_inches='tight')
    plt.show()
# Violin plot for presidents with an active Twitter account and those without one
def figure_5():
    g = sns.violinplot(y=pres["Government responsibility"], x=pres["Active Twitter Account"])
    g.set_title("Figure 5: Differences of Having an Active Twitter Account",
                fontsize=15, fontdict={"weight": "bold"})
    g.set(xlabel='Active Twitter Account', ylabel='Government responsibility')
    plt.savefig("Figure 5.png", bbox_inches='tight')
    plt.show()
# Regression model for mention score and government responsibility
def figure_6():
    g = sns.lmplot(x="Government_responsibility", y="ln_mention", data=mean_survey)
    g.fig.suptitle("Figure 6: Regression Model for Government Responsibility and Mention Scores by Country",
                   fontsize=15, fontweight="bold")
    g.set(xlabel='Government Responsibility', ylabel='Log(Mention Score)')
    plt.savefig("Figure 6.png", bbox_inches='tight')
    plt.show()
# Tables
# Summary statistics for frequency and followers
table_4 = pd.DataFrame([],columns=["Mean","Std Dev.","Min.","Max."],index=["Seconds","Followers"])
table_4.loc["Seconds"]= [mean["seconds"].mean(),mean["seconds"].std(),mean["seconds"].min(),mean["seconds"].max()]
table_4.loc["Followers"]= [mean["followers"].mean(),mean["followers"].std(),mean["followers"].min(),mean["followers"].max()]
# Countries with the fewest followers and the lowest mention frequency (largest gap)
min_fo = mean["followers"].idxmin()
max_gap = mean["seconds"].idxmax()
min_followers = mean.loc[min_fo, "country"]
min_freq = mean.loc[max_gap, "country"]
# Important parameters for the regression model of seconds on population
# (the log columns are created here so the fit does not depend on figure_4 having run)
mean_pop["ln_pop"] = np.log(mean_pop["population(m)"])
mean_pop["ln_seconds"] = np.log(mean_pop["seconds"])
form_2 = 'ln_seconds ~ ln_pop'
fit_2 = smf.ols(formula = form_2, data = mean_pop).fit()
table_5 = pd.DataFrame([], columns=["R squared", "Coefficient"],
                       index=["Regression Model"])
table_5.loc["Regression Model"] = [fit_2.rsquared, fit_2.params["ln_pop"]]
# T-test for comparing the differences of having an active Twitter account or not
test=stats.ttest_ind(pres["Government responsibility"][pres["Active Twitter Account"]=="Yes"],
pres["Government responsibility"][pres["Active Twitter Account"]=="No"])
p_value=test[1]
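# stats.ttest_ind returns (statistic, p-value); a small p_value would suggest
# the two groups differ in mean "Government responsibility".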
# Important Parameters for the Regression Model for mention score and government responsibility
table_7 = pd.DataFrame([],columns=["R squared","Coefficient for Gov. Resp.","p-value for Gov. Resp."],
index=["Regression Model"])
table_7.loc["Regression Model"] = [fit_3.rsquared, fit_3.params["Government_responsibility"], fit_3.pvalues["Government_responsibility"]]