-
Notifications
You must be signed in to change notification settings - Fork 0
/
readDat2.R
175 lines (146 loc) · 6.17 KB
/
readDat2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# an update of the readDat.R script with cleaner code for first month mortality
# read in the STATA data:
# DHS recode manual:
# https://dhsprogram.com/pubs/pdf/DHSG4/Recode6_DHS_22March2013_DHSG4.pdf
# ?:
# https://dhsprogram.com/pubs/pdf/SAR8/SAR8.pdf
# final report:
# https://dhsprogram.com/pubs/pdf/FR308/FR308.pdf
library(haven)
library(fields)
library(zoo)
library(latex2exp)
library(maptools)
library(data.table)
wd1 = getwd()
wd2 = "~/Google Drive/UW/Wakefield/WakefieldShared/U5MR/"
setwd(wd2)
# lines represent births
data <- data.frame(read_dta("Kenya2014BirthRecode/KEBR70FL.DTA"))
# extract the columns of interest
# b1 - month of birth of child,
# b2 - year of birth of child,
# b4 - gender of child, # TODO: important?
# b5 - child alive at time of interview,
# b7 - age of child at death in months completed,
# v024 - region of residence
# v025 - type of place of residence urban rural (32.3% of pop. is urban in 2009 and 2014, see FR p.30/2)
# v001 - cluster number (in theory there are 1,612 total. In actuality there are 1,593)
# v002 - household number
# v005 - sample weight out of 1000000. normalized weights sum to number of households
# v006 - month of interview
# V007 - year of interview
# V136 - total number of household members
# V138 - number of women in the household aged 15 to 49
# V137 - number of children in the household aged 5 or under
subdata <- data.frame(data[,c('b2', 'b1', 'b5', 'b7', 'v024', 'v025', 'v001', 'v002', 'v005', 'v006', 'v007')])
# get first month mortality rate for urban and rural areas
urbanData = subdata[subdata$v025 == 1,]
ruralData = subdata[subdata$v025 != 1,]
diedUrban = urbanData[!is.na(urbanData$b7),]
diedRural = ruralData[!is.na(ruralData$b7),]
#
sum(diedUrban$b7 == 0) / nrow(urbanData)
sum(diedRural$b7 == 0) / nrow(ruralData)
sum(diedUrban$v005[diedUrban$b7 == 0]) / sum(urbanData$v005)
sum(diedRural$v005[diedRural$b7 == 0]) / sum(ruralData$v005)
# get first month mortality rate for urban and rural areas (try the other version of this variable)
urbanData = data[data$v140 == 1,]
ruralData = data[data$v140 != 1,]
diedUrban = urbanData[!is.na(urbanData$b7),]
diedRural = ruralData[!is.na(ruralData$b7),]
# naïve empirical rate
sum(diedUrban$b7 == 0) / nrow(urbanData)
sum(diedRural$b7 == 0) / nrow(ruralData)
# weighted empirical rate
sum(diedUrban$v005[diedUrban$b7 == 0]) / sum(urbanData$v005)
sum(diedRural$v005[diedRural$b7 == 0]) / sum(ruralData$v005)
# check to make sure the right proportion is urban (which is about 30%)
mean(data$v140 == 1)
mean(data$v025 == 1)
# extract births in the range 2010 to 2014 (most recent five years)
lowYear <- 2010
highYear <- 2014
subdata <- subdata[(subdata[,'b2'] >= lowYear & subdata[,'b2'] <= highYear),]
# add a column for the stratification variable as an interaction between
# the urban/rural indicator 'v025' (1: urban, 2:rural) and the region indicator 'v024'
subdata$regionUral <- with(subdata, interaction(v024, v025), drop=TRUE)
# add a column for the unique households with interaction between
# the household indicator 'v002' and the cluster indicator 'v001'
subdata$hhold <- with(subdata, interaction(v001, v002), drop=TRUE)
# find for each cluster the regionUral indicator
clStrat = subdata[,c("v001", "regionUral", "v024", "v025", "v005")]
clStrat = clStrat[!duplicated(clStrat), ]
colnames(clStrat) = c("clusterID", "regionRural", "region", "urban", "samplingWeight")
clStrat$urban = clStrat$urban == 1
# determine whether each child survived for at least one month
lived = subdata$b7 > 0
lived[is.na(lived)] = TRUE
died = !lived
# get the number of birth by cluster
n <- table(subdata[,'v001'])
clusterid <- dimnames(n)[[1]]
n.data = data.frame(clusterID=clusterid, n=as.vector(n))
# get the number of deaths by cluster
# y <- table(subdata[,'v001'])
y = aggregate(died, list(subdata[,'v001']), sum)
n.data$y = y$x
# add in strata
mort <- merge(n.data, clStrat, by='clusterID', all=TRUE, sort=TRUE)
# Read geographical information
library(rgdal)
spObj = readOGR(dsn = "Kenya2014gps/", layer = "KEGE71FL")
# Extract (lon, lat) coordinates of all clusters
geoObj = data.frame(cId = spObj$DHSCLUST, lon = spObj$LONGNUM, lat = spObj$LATNUM)
# Extract coordinates of clusters with data
idx = match(mort$clusterID, geoObj$cId)
mort$lon = geoObj$lon[idx]
mort$lat = geoObj$lat[idx]
# Missing geographical information is assigned value (0,0)
# Remove these
missIdx = which(mort$lon == 0)
mort = mort[-missIdx,]
library(SUMMER)
library(foreign)
gpsDat = readShapePoints("Kenya2014gps/KEGE71FL.shp")
coords = attr(gpsDat, "coords")
plot(coords)
world(add=TRUE)
test = coords[,1] < 20 # these are the observations whose source is missing. remove these
sum(test)
# remove observations with unknown locations and set unknown data to NA
gpsDat=gpsDat[!test,]
names(gpsDat)=c("ID","countryID", "year", "clustID", "countryIDFIPS", "countryIDAdminID",
"admin1FIPS","admin1IDSALB",
"admin1SALB", "admin1ID", "admin1", "regionID", "region",
"source", "urban", "lat", "lon", "altGPS", "altRadar", "coordRef")
gpsDat$altGPS[gpsDat$altGPS == 9999] = NA
gpsDat$altGPS[gpsDat$altRadar == 9999] = NA
gpsDat$urban = gpsDat$urban == "U"
gpsDat$countryIDFIPS[gpsDat$countryIDFIPS == "NULL"] = NA
gpsDat$admin1FIPS[gpsDat$admin1FIPS == "NULL"] = NA
gpsDat$admin1IDSALB[gpsDat$admin1IDSALB == "NULL"] = NA
gpsDat$admin1SALB[gpsDat$admin1SALB == "NULL"] = NA
gpsDat$countryIDAdminID[gpsDat$countryIDAdminID == "NULL"] = NA
setwd(wd1)
save(gpsDat, file="gpsDat.RData")
setwd(wd2)
# get region and admin data from gps data, add to clusters in mort dataset
gpsI = match(data.frame(rbind(mort$lon, mort$lat)), data.frame(rbind(gpsDat$lon, gpsDat$lat)))
mort$admin1 = gpsDat$admin1[gpsI]
mort$region = gpsDat$region[gpsI]
# get easting and northing using projection
tmp = projKenya(mort$lon, mort$lat)
mort$east = tmp[,1]
mort$north = tmp[,2]
setwd(wd1)
save(mort, file="kenyaData.RData")
setwd(wd2)
# find average urban and rural neonatal mortality rates
sum(mort$y[mort$urban])/sum(mort$n[mort$urban])
sum(mort$y[!mort$urban])/sum(mort$n[!mort$urban])
# make an extremely rudimentary linear regression model, examining the urban effect
perc = mort$y/mort$n
mod = lm(perc ~ urban + admin1, data=mort)
summary(mod)
setwd(wd1)