create vulnerability index - MSOA - NI.r

##
## SOA-level vulnerability index for Northern Ireland
##
library(tidyverse)
library(readxl)
library(httr)
library(sf)
library(rmapshaper)

source("functions.r")
source("load lookup tables.r")


###############################################################################
## Load and prep all data sources
##

##
## IMD data
##
# download Super Output Area results
# source: https://www.nisra.gov.uk/statistics/deprivation/northern-ireland-multiple-deprivation-measure-2017-nimdm2017
GET("https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/NIMDM17_SOAresults.xls",
    write_disk(tf <- tempfile(fileext = ".xls")))

imd_health = read_excel(tf, sheet = "Health and Disability")
imd_access = read_excel(tf, sheet = "Access to Services")
imd_env = read_excel(tf, sheet = "Living Environment")
imd_income = read_excel(tf, sheet = "Income")
imd_employment = read_excel(tf, sheet = "Employment")

soa_names = imd_health %>% 
  select(Code = SOA2001, Name = SOA2001_name)

unlink(tf); rm(tf)

# helper function to swap ranks from low to high
reverse_ranks = function(x) (length(x) + 1) - x

clinical_indicators = imd_health %>% 
  select(
    Code = SOA2001, 
    `Standardized ratio of people registered as having cancer (Rank)` = `Standardized ratio of people registered as having cancer (excluding non-melanoma skin cancers)\n(rank)`, 
    `Standardized ratio of people on multiple prescriptions (Rank)` = `Standardized ratio of people on multiple prescriptions on a regular basis\n(rank)`, 
    `Standardized ratio of people with a long-term health problem or disability (Rank)` = `Standardized ratio of people with a long-term health problem or disability\n(Excluding Mental Health problems)\n(rank)`
  ) %>%  
  mutate_if(is.numeric, reverse_ranks)  # reverse ranks so 1 = least deprived
  
wellbeing_indicators = imd_health %>% 
  select(Code = SOA2001, `Combined Mental Health Indicator (Rank)` = `Combined Mental Health Indicator\n(rank)`) %>% 
  mutate_if(is.numeric, reverse_ranks)  # reverse ranks so 1 = least deprived

economic_indicators = imd_income %>% 
  select(Code = SOA2001, `Proportion of low-income population` = `Proportion of the population living in households whose equivalised income is below 60 per cent of the NI median \n(%)`) %>% 
  
  left_join(imd_employment %>% 
              select(Code = SOA2001, `Proportion of employment-deprived population` = `Proportion of the working age population who are employment deprived\n(%)`),
            by = "Code")

social_indicators = imd_access %>% 
  left_join(imd_env, by = "SOA2001") %>% 
  select(
    Code = SOA2001, 
    `Service-weighted fastest travel time by private transport (Rank)` = `Service-weighted fastest travel time by private transport\n(rank)`, 
    # `Proportion of properties with broadband speed below 10Mb/s (Rank)` = `Proportion of properties with broadband speed below 10Mb/s\n(rank)`, 
    `Housing Quality (Rank)` = `Housing Quality \nSub-Domain \n(Rank)`, 
    `Housing Access (Rank)` = `Housing Acces Sub-Domain (Rank)`,
    `Outdoor physical environment Sub-Domain (Rank)` = `Outdoor physical environment Sub-Domain (Rank)`
  ) %>% 
  mutate_if(is.numeric, reverse_ranks)  # reverse ranks so 1 = least deprived


##
## Load population estimates
##
pop_older = read_csv("data/population estimates msoa11 - older people.csv") %>% 
  filter(str_sub(Code, 1, 1) == "9")


##
## Load loneliness risks
## source: https://github.com/matthewgthomas/loneliness
##
loneliness = read_csv("https://github.com/matthewgthomas/loneliness/raw/master/NI/msoa_loneliness.csv") %>% 
  select(Code = SOA_CODE, Loneliness = loneills_2018)


##
## Load digital exclusion
##
digital_exclusion = read_csv("data/CACI/digital-exclusion-lsoa.csv")  # use LSOA data because NI doesn't have MSOAs

##
## financial vulnerability (from CACI)
##
fin_lsoa = read_csv("data/CACI/financial-lsoa.csv")
fin_lsoa = fin_lsoa %>% select(Code = LSOA11CD, `Financial Vulnerability score`)

##
## Green spaces in SOAs
## source: Brian Johnston (Queens University Belfast
## (currently only for Belfast so will leave out for now)
##
# green = read_csv("data/green-area-NI.csv")


###############################################################################

# ---- Calculate scores for each domain of vulnerability ----
# Methodology:
# 1. Combine data to create domains (e.g., clinical, health and wellbeing, etc.)
# 2. Create weighted domain scores for each MSOA using unsupervised learning.
#    See weighted_domain_score() function in functions.r file for more details
#    on the methodology used.

# ---- Clinical vulnerability domain ----
# Use Maximum Likelihood Factor Analysis (MLFA) to determine weights
# as it is assumed indicators have a relationship
clinical = clinical_indicators %>% 
  left_join(pop_older, by = "Code") %>% 
  
  # Use MLFA model
  weighted_domain_scores(model = "MLFA",
                         domain = "Clinical")

# ---- Health and wellbeing domain ----
# Use Principle Components Factor Analysis (PCA) to determine weights
# as it is assumed indicators do not have a clear relationship
health_wellbeing = wellbeing_indicators %>% 
  left_join(loneliness, by = "Code") %>% 
  
  # Use PCA model
  weighted_domain_scores(model = "PCA",
                         domain = "Health/Wellbeing")

# ---- Economic vulnerability domain ----
# Use Principle Components Factor Analysis (PCA) to determine weights
# as it is assumed indicators do not have a clear relationship
economic = economic_indicators %>% 
  left_join(fin_lsoa, by = "Code") %>% 
  # Use PCA model
  weighted_domain_scores(model = "PCA",
                         domain = "Economic")

# ---- Social vulnerability domain ----
# Split the domain into four separate subdomains - access, housing, environmental and digital exclusion.
# The reason for splitting is there are three distinct subsets of indicators within the domain.
# Use MLFA to determine the weights for the housing and access subdomains and set air quality to have
# a weight of 1/4 of the overall domain.

# Calculate access vulnerability score
social_access <- social_indicators %>% 
  select(Code, `Service-weighted fastest travel time by private transport (Rank)`) %>% 
  mutate_if(is.numeric,
            list(`Access Vulnerability score` = function(x) standardised(x)))

# Calculate housing vulnerability score
social_housing <- social_indicators %>% 
  select(Code, `Housing Quality (Rank)`, `Housing Access (Rank)`) %>% 
  
  weighted_domain_scores(model = "PCA",
                         domain = "Housing") %>% 
  select(-`Housing Vulnerability rank`,
         -`Housing Vulnerability decile`)

# Calculate environmental vulnerability score. 
social_env <- social_indicators %>% 
  select(Code, `Outdoor physical environment Sub-Domain (Rank)`) %>% 
  
  mutate_if(is.numeric,
            list(`Env Vulnerability score` = function(x) standardised(x)))

# get digital vulnerability score and rank
social_digital <- digital_exclusion %>% 
  select(Code = LSOA11CD, `Digital Vulnerability score`, `Digital Vulnerability rank`)

# Combine subdomains
social <- social_access %>% 
  left_join(social_housing, by = "Code") %>% 
  left_join(social_env, by = "Code") %>% 
  left_join(social_digital, by = "Code")

# Calculate social domain score
social <- social %>% 
  mutate(`Access Vulnerability score` = `Access Vulnerability score`* (1/4),
         `Housing Vulnerability score` =  `Housing Vulnerability score` * (1/4),
         `Env Vulnerability score` = `Env Vulnerability score` * (1/4),
         `Digital Vulnerability score` = `Digital Vulnerability score` * (1/4)) %>%
  
  mutate(`Social Vulnerability score` = reduce(select(., ends_with("Vulnerability score")), `+`)) %>% 
  mutate(`Social Vulnerability rank` = rank(`Social Vulnerability score`)) %>%
  mutate(`Social Vulnerability decile` = calc_risk_quantiles(`Social Vulnerability rank`, quants = 10)) %>% 
  select(-`Access Vulnerability score`, -`Housing Vulnerability score`, -`Env Vulnerability score`, -`Digital Vulnerability score`)


###############################################################################
## Construct the overall MSOA-level vulnerability index
##
vulnerability = clinical %>% 
  # keep only vulnerability ranks for each domain
  select(Code, `Clinical Vulnerability rank`) %>% 
  
  left_join(health_wellbeing %>% select(Code, `Health/Wellbeing Vulnerability rank`),
            by = "Code") %>% 
  
  left_join(economic %>% select(Code, `Economic Vulnerability rank`),
            by = "Code") %>% 
  
  left_join(social %>% select(Code, `Social Vulnerability rank`),
            by = "Code") %>% 
  
  calc_domain_scores(rank.indicators = FALSE) %>% 
  
  select(Code, starts_with("Vulnerability"))

# ---- Construct a socioeconomic vulnerability index ----
vulnerability_se <- clinical %>%
  
  # keep only vulnerability ranks for each domain
  select(Code, `Clinical Vulnerability rank`) %>%
  
  left_join(health_wellbeing %>% select(Code, `Health/Wellbeing Vulnerability rank`),
            by = "Code") %>%
  
  left_join(economic %>% select(Code, ends_with("Vulnerability rank")),
            by = "Code") %>%
  
  left_join(social %>% select(Code, `Social Vulnerability rank`),
            by = "Code") %>%
  
  calc_domain_scores(domain = "Socioeconomic", rank.indicators = FALSE, clinical_weight = 0, health_weight = 0, economic_weight = 0.5, social_weight = 0.5) %>%
  
  select(Code, starts_with("Socioeconomic"))

##
## combine all domains and overall scores into one dataframe and save
##
vulnerability_all = clinical %>% 
  left_join(health_wellbeing, by = "Code") %>% 
  left_join(economic, by = "Code") %>% 
  left_join(social, by = "Code") %>% 
  left_join(vulnerability_se, by = "Code") %>% 
  left_join(vulnerability, by = "Code") %>% 
  
  # get MSOA names
  left_join(soa_names, by = "Code") %>% 
  
  select(Code, Name, everything())

# save
vulnerability_all %>% write_csv("output/vulnerability-SOA-NI.csv")

# save a copy with only vulnerability scores
vulnerability_all %>% 
  select(Code, Name, contains("Vulnerability")) %>% 
  write_csv("output/vulnerability-scores-SOA-NI.csv")


##
## save as geojson
##
# Super Output Areas from https://www.opendatani.gov.uk/dataset/nisra-open-data-boundaries-super-output-areas-2011/resource/80392e82-8bee-42de-a1e3-82d1cbaa983f
# soa = read_sf("https://cc-p-ni.ckan.io/dataset/678697e1-ae71-41f3-abba-0ef5f3f352c2/resource/80392e82-8bee-42de-a1e3-82d1cbaa983f/download/soa2001.json") %>% 
soa = read_sf("data/boundaries/SOA2011.shp") %>% 
  st_transform(crs = 4326) %>% 
  ms_simplify(keep_shapes = T)

soa_vi = soa %>% 
  left_join(vulnerability_all, by = c("SOA_CODE" = "Code")) %>% 
  mutate(`Socioeconomic Vulnerability quintile` = calc_risk_quantiles(`Socioeconomic Vulnerability rank`, quants = 5),
         `Vulnerability quintile` = calc_risk_quantiles(`Vulnerability rank`, quants = 5))

geojson_name = "output/vulnerability-SOA-NI.geojson"
if (file.exists(geojson_name)) file.remove(geojson_name)

soa_vi %>% 
  rename(Code = SOA_CODE) %>% 
  # append a note to deciles and ranks
  rename_at(vars(matches("decile|rank")), ~ str_c(., " (1 = least vulnerable)")) %>% 
  
  select(-SOA_LABEL) %>% 
  write_sf(geojson_name)


##
## checking and debugging
##
# get most and least vulnerable MSOAs to check their underlying indicators look correct (they do)
test = vulnerability_all %>% filter(`Vulnerability rank` %in% c(1, nrow(vulnerability_all)))

# check relationship between socioeconomic vulnerability and overall vulnerability; cor = 0.8833191
vulnerability_all %>% ggplot(aes(x = `Vulnerability rank`, `Socioeconomic Vulnerability rank`)) + geom_point(alpha = 0.1) + geom_smooth()
cor.test(~ `Vulnerability rank` + `Socioeconomic Vulnerability rank`, data = vulnerability_all)