-
Notifications
You must be signed in to change notification settings - Fork 0
/
population_lookups_CA_level.R
241 lines (176 loc) · 8.71 KB
/
population_lookups_CA_level.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
### 1. Analyst notes ----
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This script creates population lookups using council area (CA) population estimates
# NRS typically release CA estimates first, and then small area population estimates (SAPE) a few months later
# However, there has been a significant delay in the release of 2022 SAPEs
# This script can therefore be run to create population estimates at CA, HB, HSCP, ADP and Scotland level whilst waiting on SAPEs to be released
# This allows a number of indicators to be updated where population estimates are used as the denominator,
# and where they are not released at intermediate zone or locality level (i.e. dependent on SAPEs)
# This is because we have different types of population lookups
# (i.e. datazone level lookups for calculating all areatypes including HSC localities and IZs
# and higher geography level lookups containing only CA/HB/ADP/HSCP/Scotland)
# there are therefore a number of indicators where the population lookup passed to the analysis functions are those higher geo level lookups:
# Indicators which can be updated prior to release of SAPEs:-
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (these ones we can extract the data ourselves)
# COPD incidence
# COPD deaths
# Alcohol-related hospital admissions , aged 11-25 years
# Children hospitalised due to asthma, aged 0-15 years
# Deaths from suicide, female
# Deaths from suicide, male
# Deaths from suicide, aged 11-25 years
# Deaths in children, aged 1-15
# drug related hospital admissions
# Drug-related hospital admissions
# Lung cancer deaths
# Lung cancer registrations
# Unintentional unjuries inunder 5s
# Young people admitted to hospital due to assault
# (these ones require data requests)
# Drug-related deaths
# Drug-related deaths, females
# Drug-related deaths, males
# Smoking Attributable admissions
# Smoking Attrbutable deaths
# Availability of smoking cessation products
# Personal licences in force
# Premise licences in force - Off trade
# Premise licences in force - On trade
# Premise licences in force - Total
# (these ones we save ourselves from publications)
# Child protection with parental alcohol misuse
# Child protection with parental drug misuse
# Child protection with parental drug or alcohol misuse
# Children looked after by local authority
# Children on the child protection register
# Children referred to the Children's Reporter for care and protection
# Children referred to the Children's Reporter for offences
# Primary school children
# Secondary school children
# Lookups which need updated to update these indicators:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# note: versions ending in _SR contain age and sex splits for calculating standardised rates
# CA_pop_allages_SR
# CA_pop_16+_SR
# CA_pop_11to25_SR
# CA_pop_11to25
# CA_pop_12+
# CA_pop_under18
# CA_pop_1to15
# CA_pop_under16_SR
# CA_pop_15to25_SR
# CA_pop_under5_SR
# ~~~~~~~~~~~~~~~~~~~~~~~~
# 2. Housekeeping ----
# ~~~~~~~~~~~~~~~~~~~~~~~~
## packages ----
library(dplyr)
library(tidyr)
## filepaths ----
cl_out <- "/conf/linkage/output/lookups/Unicode/Populations/Estimates/" #cl-out folderwith new 2022 estimates
scotpho_lookups_folder <- "/PHI_conf/ScotPHO/Profiles/Data/Lookups/" # scotpho folder where geography and population lookups are saved
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# 3. Lookups ----
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
## geography lookup
geo_lookup<- readRDS(paste0(scotpho_lookups_folder, "Geography/DataZone11_All_Geographies_Lookup.rds")) |>
select(ca2019, hscp2019, hb2019, adp) |>
unique()
## new council area population estimates
CA_estimates_raw<- readRDS(paste0(cl_out, "CA2019_pop_est_1981_2022.rds")) |>
filter(year >= 2002)
# ~~~~~~~~~~~~~~~~~~~~~~
### 4. Functions ----
# ~~~~~~~~~~~~~~~~~~~~~~~
# function to create population lookups for different age groups
create_population_lookups <- function(lower_age, upper_age, basefile = CA_estimates_raw, geography_lookup = geo_lookup, name){
# step 1: filter by age
# note this needs to be done first before creating age group column
# so that for example, if trying to create a population 16+ file, the age 15 is removed from the '15-19' grouping
CA_estimates <- basefile |>
filter(age >= lower_age & age <= upper_age)
# step 2: create age group column required for calculating standardised rates
CA_estimates <- CA_estimates |>
mutate(age_grp = case_when(
age < 5 ~ 1, age > 4 & age <10 ~ 2, age > 9 & age <15 ~ 3, age > 14 & age <20 ~ 4,
age > 19 & age <25 ~ 5, age > 24 & age <30 ~ 6, age > 29 & age <35 ~ 7,
age > 34 & age <40 ~ 8, age > 39 & age <45 ~ 9, age > 44 & age <50 ~ 10,
age > 49 & age <55 ~ 11, age > 54 & age <60 ~ 12, age > 59 & age <65 ~ 13,
age > 64 & age <70 ~ 14, age > 69 & age <75 ~ 15, age > 74 & age <80 ~ 16,
age > 79 & age <85 ~ 17, age > 84 & age <90 ~ 18, age > 89 ~ 19,
TRUE ~ as.numeric(age)
))
# step 3: attach other geography levels (HB, ADP, HSCP, Scotland)
CA_estimates <- CA_estimates |>
left_join(geography_lookup, by = "ca2019") |>
mutate(Scotland = "S00000001")
# step 4: pivot the data longer to create a geography code column
CA_estimates <- CA_estimates |>
pivot_longer(cols = c(ca2019, hscp2019, hb2019, adp, Scotland),
names_to = "area_type",
values_to = "code")
# step 5: sum the population estimates for each geography
# a. population estimates by geography and year only
# for calculating crude rates
CR_CA_estimates <- CA_estimates |>
group_by(year, code) |>
summarize(denominator = sum(pop), .groups = 'drop')
# b. population estimates by geography, year, age and sex
# for calculating standardised rates
SR_CA_estimates <- CA_estimates |>
rename(sex_grp = sex) |>
group_by(year, code, age_grp, sex_grp) |>
summarize(denominator = sum(pop), .groups = 'drop')
# step 6: save the lookups in a temporary sub-folder
saveRDS(CR_CA_estimates, file=paste0(scotpho_lookups_folder, 'Population/Temporary/CA_pop_', name,'.rds'))
saveRDS(SR_CA_estimates, file=paste0(scotpho_lookups_folder, 'Population/Temporary/CA_pop_', name,'_SR.rds'))
}
# function to check the historic data in the newly created lookup against the old lookup saved in the scotpho folder
dq_check <- function(filename){
old_file <- readRDS(paste0(scotpho_lookups_folder, "Population/", filename, ".rds")) # read in old file
new_file <- readRDS(paste0(scotpho_lookups_folder, "Population/Temporary/", filename, ".rds")) # read in new file
# columns to join new and old datasets on
join_cols = c("year", "code")
# also join on age and sex columns if it's an SR file
if(grepl("SR", filename)){
join_cols <- c(join_cols, "sex_grp", "age_grp")
}
# join files and check
check <- left_join(old_file, new_file, by = join_cols) |>
mutate(number_diff = denominator.x - denominator.y,
percent_diff = (number_diff/denominator.x) * 100)
rm(old_file, new_file)
return(check)
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 5. Create population lookups ----
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
create_population_lookups(lower_age = 0, upper_age = 90, name = "allages")
create_population_lookups(lower_age = 16, upper_age = 90, name = "16+")
create_population_lookups(lower_age = 11, upper_age = 25, name = "11to25")
create_population_lookups(lower_age = 0, upper_age = 15, name = "under16")
create_population_lookups(lower_age = 0, upper_age = 4, name = "under5")
create_population_lookups(lower_age = 15, upper_age = 25, name = "15to25")
create_population_lookups(lower_age = 0, upper_age = 17, name = "under18")
create_population_lookups(lower_age = 0, upper_age = 17, name = "1to15")
create_population_lookups(lower_age = 12, upper_age = 90, name = "12+")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 6. Check historic data against old lookups ----
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# choosing a few new lookups and checking against the old lookup before archiving them
# some historic changes between 2012 - 2021
# everything pre 2012 an exact matc
#check <- dq_check("CA_pop_allages_SR")
#check <- dq_check("CA_pop_allages")
#check <- dq_check("CA_pop_16+_SR")
#check <- dq_check("CA_pop_16+")
#check <- dq_check("CA_pop_16+")
#check <- dq_check("CA_pop_11to25_SR")
#check <- dq_check("CA_pop_11to25")
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 7. create backups of old files ---
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# note: save the old files from the Population lookup folder into the 'backups' subfolder, then move these files
# across to the population folder after checking they are ok so they are ready to be used within the indicator analysis functions.