-
Notifications
You must be signed in to change notification settings - Fork 0
/
readDatEd.R
157 lines (130 loc) · 5.75 KB
/
readDatEd.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# an update of the readDat.R script with cleaner code for educational attainment
# read in the STATA data:
# DHS recode manual:
# https://dhsprogram.com/pubs/pdf/DHSG4/Recode6_DHS_22March2013_DHSG4.pdf
# ?:
# https://dhsprogram.com/pubs/pdf/SAR8/SAR8.pdf
# final report:
# https://dhsprogram.com/pubs/pdf/FR308/FR308.pdf
library(haven)
library(fields)
library(zoo)
library(latex2exp)
library(maptools)
library(data.table)
wd1 = getwd()
wd2 = "~/Google Drive/UW/Wakefield/WakefieldShared/U5MR/"
setwd(wd2)
# lines represent births
# data <- data.frame(read_dta("Kenya2014BirthRecode/KEBR70FL.DTA"))
data <- data.frame(read_dta("Kenya2014IndividualRecode/KEIR71FL.DTA"))
## variables from the birth recode
# extract the columns of interest
# b1 - month of birth of child,
# b2 - year of birth of child,
# b4 - gender of child, # TODO: important?
# b5 - child alive at time of interview,
# b7 - age of child at death in months completed,
# v024 - region of residence
# v025 - type of place of residence urban rural (32.3% of pop. is urban in 2009 and 2014, see FR p.30/2)
# v001 - cluster number (in theory there are 1,612 total. In actuality there are 1,593)
# v002 - household number
# v005 - sample weight out of 1000000. normalized weights sum to number of households
# v006 - month of interview
# V007 - year of interview
# V136 - total number of household members
# V138 - number of women in the household aged 15 to 49
# V137 - number of children in the household aged 5 or under
## variables from this (individual/women) recode
# v024 - region of residence
# v025 - type of place of residence urban rural (32.3% of pop. is urban in 2009 and 2014, see FR p.30/2)
# v001 - cluster number (in theory there are 1,612 total. In actuality there are 1,593)
# v002 - household number
# v005 - sample weight out of 1000000. normalized weights sum to number of households
# v006 - month of interview
# V007 - year of interview
# v012 - current age in completed years
# v106 - highest education level attended in order of: no educ, primary, secondary, higher
# v107 - highest year comleted (years completed at level given in v106)
# v149 - educational achievement: none (0), incomplete primary (1), completed primary (2), incomplete secondary (3),
# complete secondary (4), higher education (5)
subdata <- data.frame(data[,c('v024', 'v025', 'v001', 'v002', 'v005', 'v012', 'v106', 'v107', 'v149')])
# extract women with the given age range
lowAge = 20
highAge = 29
subdata <- subdata[(subdata[,'v012'] >= lowAge & subdata[,'v012'] <= highAge),]
# add a column for the stratification variable as an interaction between
# the urban/rural indicator 'v025' (1: urban, 2:rural) and the region indicator 'v024'
subdata$regionUral <- with(subdata, interaction(v024, v025), drop=TRUE)
# add a column for the unique households with interaction between
# the household indicator 'v002' and the cluster indicator 'v001'
subdata$hhold <- with(subdata, interaction(v001, v002), drop=TRUE)
# find for each cluster the regionUral indicator
clStrat = subdata[,c("v001", "regionUral", "v024", "v025", "v005")]
clStrat = clStrat[!duplicated(clStrat), ]
colnames(clStrat) = c("clusterID", "regionRural", "region", "urban", "samplingWeight")
clStrat$urban = clStrat$urban == 1
# determine whether each woman completed their secondary education
completed = subdata$v149 >= 4
completed[is.na(completed)] = TRUE
# get the number of women by cluster
n <- table(subdata[,'v001'])
clusterid <- dimnames(n)[[1]]
n.data = data.frame(clusterID=clusterid, n=as.vector(n))
# get the number of women who completed their secondary education by cluster
# y <- table(subdata[,'v001'])
y = aggregate(completed, list(subdata[,'v001']), sum)
n.data$y = y$x
# add in strata
ed <- merge(n.data, clStrat, by='clusterID', all=TRUE, sort=TRUE)
# Read geographical information
library(rgdal)
spObj = readOGR(dsn = "Kenya2014gps/", layer = "KEGE71FL")
# Extract (lon, lat) coordinates of all clusters
geoObj = data.frame(cId = spObj$DHSCLUST, lon = spObj$LONGNUM, lat = spObj$LATNUM)
# Extract coordinates of clusters with data
idx = match(ed$clusterID, geoObj$cId)
ed$lon = geoObj$lon[idx]
ed$lat = geoObj$lat[idx]
# Missing geographical information is assigned value (0,0)
# Remove these
missIdx = which(ed$lon == 0)
ed = ed[-missIdx,]
library(SUMMER)
library(foreign)
gpsDat = readShapePoints("Kenya2014gps/KEGE71FL.shp")
coords = attr(gpsDat, "coords")
plot(coords)
world(add=TRUE)
test = coords[,1] < 20 # these are the observations whose source is missing. remove these
sum(test)
# remove observations with unknown locations and set unknown data to NA
gpsDat=gpsDat[!test,]
names(gpsDat)=c("ID","countryID", "year", "clustID", "countryIDFIPS", "countryIDAdminID",
"admin1FIPS","admin1IDSALB",
"admin1SALB", "admin1ID", "admin1", "regionID", "region",
"source", "urban", "lat", "lon", "altGPS", "altRadar", "coordRef")
gpsDat$altGPS[gpsDat$altGPS == 9999] = NA
gpsDat$altGPS[gpsDat$altRadar == 9999] = NA
gpsDat$urban = gpsDat$urban == "U"
gpsDat$countryIDFIPS[gpsDat$countryIDFIPS == "NULL"] = NA
gpsDat$admin1FIPS[gpsDat$admin1FIPS == "NULL"] = NA
gpsDat$admin1IDSALB[gpsDat$admin1IDSALB == "NULL"] = NA
gpsDat$admin1SALB[gpsDat$admin1SALB == "NULL"] = NA
gpsDat$countryIDAdminID[gpsDat$countryIDAdminID == "NULL"] = NA
setwd(wd1)
save(gpsDat, file="gpsDatEd.RData")
setwd(wd2)
# get region and admin data from gps data, add to clusters in ed dataset
gpsI = match(data.frame(rbind(ed$lon, ed$lat)), data.frame(rbind(gpsDat$lon, gpsDat$lat)))
ed$admin1 = gpsDat$admin1[gpsI]
ed$region = gpsDat$region[gpsI]
# get easting and northing using projection
tmp = projKenya(ed$lon, ed$lat)
ed$east = tmp[,1]
ed$north = tmp[,2]
setwd(wd1)
save(ed, file="kenyaDataEd.RData")
setwd(wd2)
aggregate(ed$y/ed$n, by=list(ed$urban), mean)
setwd(wd1)