-
Notifications
You must be signed in to change notification settings - Fork 2
/
Script 5 - FCDO SQL database save.R
199 lines (158 loc) · 8.91 KB
/
Script 5 - FCDO SQL database save.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# --------------------------------------------------------------- #
# Script 5
# Script to split output dataset into 4 relational tables for FCDO SQL database
# --------------------------------------------------------------- #
# Read in collated dataset from previous script
all_projects_tidied <- readRDS("Outputs/all_projects_tidied.rds")
org_names_and_locations <- readRDS("Outputs/org_names_and_locations.rds")
# Connect to ODA RI Projects database on development server
# (need to be connected to DFID VPN)
con_live <- DBI::dbConnect(odbc::odbc(),
Driver = "SQL Server",
Server = "hel-sql-120",
Database = "ODARIProjects",
Trusted_Connection = "True")
# 1) Create master project table ----
project_table <- all_projects_tidied %>%
# remove all fields that can have multiple entries for a project
select(-Funder, -Fund, -iati_id,
-recipient_country,
-lead_org_name, -lead_org_country,
-partner_org_name, -partner_org_country,
-period_start, -period_end,
-last_updated) %>%
# limit character fields to 255 chars
mutate(title = if_else(nchar(title) > 255, substr(title, 1, 255), title),
abstract = if_else(nchar(abstract) > 5000, substr(abstract, 1, 5000), abstract),
subject = if_else(nchar(subject) > 255, substr(subject, 1, 255), subject),
link = if_else(nchar(link) > 255, "", link)) %>%
# remove special characters
mutate(title = str_replace_all(title, "‘|’|´", ""),
title = str_replace_all(title, "´", "'"),
abstract = str_replace_all(abstract, "‘|’", ""),
abstract = str_replace_all(abstract, "´", "'")) %>%
rename(project_id = id) %>%
unique()
# find duplicate project ids
duplicates <- project_table %>%
group_by(project_id) %>%
summarise(n = n()) %>%
filter(n > 1) %>%
inner_join(project_table, by = "project_id")
# remove duplicate rows (rough)
project_table <- project_table[!duplicated(project_table$project_id), ]
saveRDS(project_table, file = "Outputs/project_table.rds")
# 2) Create funder table ----
funder_table <- all_projects_tidied %>%
select(project_id = id, funder = Funder,
fund = Fund, funder_iati_id = iati_id) %>%
unique()
saveRDS(funder_table, file = "Outputs/funder_table.rds")
# 3) Create organisation table ----
organisation_table <- org_names_and_locations %>%
mutate(organisation_country = str_to_lower(organisation_country)) %>%
# cleaning step 1
mutate(organisation_country = str_replace_all(organisation_country, "\\(the\\)", ""), # remove (the)
organisation_country = gsub("[()]", "", organisation_country), # remove all parentheses
organisation_country = str_replace_all(organisation_country, "tanzania, united republic of", "tanzania"),
organisation_country = str_replace_all(organisation_country, "congo the democratic republic of the|drc|democratic republic of congo",
"democratic republic of the congo"),
organisation_country = str_replace_all(organisation_country, "china people's republic of", "china"),
organisation_country = str_replace_all(organisation_country, "democratic people's republic of korea", "democratic people’s republic of korea")) %>%
# cleaning step 2
mutate(organisation_country = case_when(
str_detect(organisation_country, "uk|scotland|wales|united kingdom|england|ireland") & organisation_country != "ukraine" ~ "united kingdom",
str_detect(organisation_country, "usa|united states") ~ "united states",
organisation_country == "us" ~ "united states",
str_detect(organisation_country, "ivoire") ~ "ivory coast",
str_detect(organisation_country, "viet") ~ "vietnam",
str_detect(organisation_country, "lao") ~ "laos",
str_detect(organisation_country, "bolivia") ~ "bolivia",
str_detect(organisation_country, "syria") ~ "syria",
str_detect(organisation_country, "gaza|west bank|palestin") ~ "occupied palestinian territories",
TRUE ~ organisation_country)) %>%
mutate(organisation_country = str_to_title(organisation_country))
saveRDS(organisation_table, file = "Outputs/organisation_table.rds")
# 4) Create country table ----
# Distinguish location and beneficiary countries in main dataset
country_table <- all_projects_tidied %>%
mutate(beneficiary_country = coalesce(recipient_country, "Unknown"),
location_country = paste0(coalesce(lead_org_country, "Unknown"), ", ", coalesce(partner_org_country, "Unknown"))) %>%
select(project_id = id, location_country, beneficiary_country) %>%
unique()
# Convert location vs. beneficiary country data to long format
country_table <- country_table %>%
gather(key = "country_type", value = "Country", -project_id) %>%
mutate(country_type = if_else(country_type == "beneficiary_country", 1, 2),
Country = str_to_lower(Country))
# Clean country names
country_table_cleaned <- country_table %>%
mutate(Country = str_replace_all(Country, "\\(the\\)", ""), # remove (the)
Country = gsub("[()]", "", Country), # remove all parentheses
Country = str_replace_all(Country, "tanzania, united republic of", "tanzania"),
Country = str_replace_all(Country, "congo the democratic republic of the|drc|democratic republic of congo",
"democratic republic of the congo"),
Country = str_replace_all(Country, "china people's republic of", "china"),
Country = str_replace_all(Country, "democratic people's republic of korea", "democratic people’s republic of korea")) %>%
# standardise separators
mutate(Country = gsub("\\s*\\([^\\)]+\\)","", Country))
# Convert dataset to long with one row per country entry
country_table_cleaned <- country_table_cleaned %>%
separate_rows(Country, sep = ",|;", convert = FALSE) %>%
mutate(Country = str_trim(Country)) %>%
unique()
# Further country cleaning
country_table_cleaned <- country_table_cleaned %>%
mutate(Country = str_trim(Country)) %>%
mutate(Country = case_when(
str_detect(Country, "uk|scotland|wales|united kingdom|england|ireland") & Country != "ukraine" ~ "united kingdom",
str_detect(Country, "usa|united states") ~ "united states",
Country == "us" ~ "united states",
str_detect(Country, "ivoire") ~ "ivory coast",
str_detect(Country, "viet") ~ "vietnam",
str_detect(Country, "netherlands") ~ "netherlands",
str_detect(Country, "lao") ~ "laos",
str_detect(Country, "bolivia") ~ "bolivia",
str_detect(Country, "syria") ~ "syria",
str_detect(Country, "gaza|west bank|palestin") ~ "occupied palestinian territories",
TRUE ~ Country)) %>%
unique()
# Replace country with "unknown" if not recognised against Tableau's accepted list
country_table_final <- country_table_cleaned %>%
mutate(Country = if_else(Country %in% dac_lookup$country_name, Country, "unknown")) %>%
mutate(Country = tools::toTitleCase(Country)) %>%
unique()
# Remove unecessary unknowns
country_table_final$row_id <- seq.int(nrow(country_table_final))
# identify records with more than one country for a country_type
identify_multiples <- country_table_final %>%
group_by(project_id, country_type) %>%
summarise(n = n()) %>%
filter(n > 1)
# identify "unknown" records for projects in the dataset above
identify_unknowns_to_delete <- country_table_final %>%
filter(Country == "Unknown") %>%
left_join(identify_multiples, by = c("project_id", "country_type")) %>%
filter(!is.na(n))
# remove these "unknowns" from the country table
country_table_final <- country_table_final %>%
filter(!(row_id %in% identify_unknowns_to_delete$row_id)) %>%
select(-row_id)
# Save datasets for testing
saveRDS(country_table, file = "Outputs/country_table.rds")
saveRDS(country_table_cleaned, file = "Outputs/country_table_cleaned.rds")
saveRDS(country_table_final, file = "Outputs/country_table_final.rds")
# 5) Delete and write data in database table ----
dbSendQuery(con_live, "DELETE FROM [ODARIProjects].[dbo].[Project]")
dbAppendTable(con_live, "Project", project_table, row.names = NULL)
dbSendQuery(con_live, "DELETE FROM [ODARIProjects].[dbo].[Funder]")
dbAppendTable(con_live, "Funder", funder_table, row.names = NULL)
dbSendQuery(con_live, "DELETE FROM [ODARIProjects].[dbo].[Organisation]")
dbAppendTable(con_live, "Organisation", organisation_table, row.names = NULL)
dbSendQuery(con_live, "DELETE FROM [ODARIProjects].[dbo].[Country]")
dbAppendTable(con_live, "Country", country_table_final, row.names = NULL)
# Disconnect from SQL server
dbDisconnect(con_live)
# Clear environment
rm(duplicates, identify_multiples, identify_unknowns_to_delete,
country_table, country_table_cleaned, country_table_final)