-
Notifications
You must be signed in to change notification settings - Fork 0
/
USP522_DiffPrivOR.Rmd
1899 lines (1508 loc) · 126 KB
/
USP522_DiffPrivOR.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "A Comparison of Census 2010 SF1 & Differentially Private Data in Oregon"
author: "By Alex Brasch"
date: Winter Quarter 2020
output:
html_document:
includes:
before_body: header.html
after_body: footer.html
code_folding: hide
highlight: zenburn
self_contained: yes
theme: darkly
toc: yes
toc_depth: 2
toc_float:
collapsed: no
toc_float: yes
word_document:
toc: yes
toc_depth: '2'
pdf_document:
toc: yes
toc_depth: '2'
editor_options:
chunk_output_type: console
always_allow_html: yes
---
```{css, echo = FALSE}
pre:not([class]) {
color: #333333;
background-color: #cccccc;
}
blockquote {
padding: 10px 20px;
margin: 0 0 20px;
font-size: 12px;
border-left: 5px solid #eee;
}
```
```{r setup, echo=FALSE, warning=FALSE, error=FALSE, results='hide', message=FALSE}
knitr::opts_chunk$set(echo = TRUE, fig.width = 9, fig.height = 7)
```
```{r echo=FALSE, warning=FALSE, error=FALSE, results='hide', message=FALSE}
# Require the pacman package to easily load all necessary packages
if(!require(pacman)){install.packages("pacman");library(pacman)}
suppressPackageStartupMessages(p_load(
tidyverse,
tidycensus,
sf,
rgdal,
tigris,
janitor,
extrafont,
readxl,
writexl,
plotly,
htmlwidgets,
xfun))
# Set options
options(tigris_class = "sf", tigris_use_cache = F) # Return an object of class sf via tigris; do not cache Census shapefile downloads
options(stringsAsFactors = F) # R often uses a concept of factors to re-encode strings. This can be too early and too aggressive. Sometimes a string is just a string. To avoid problems delay re-encoding of strings by using stringsAsFactors = FALSE when creating data.frames.
options(dplyr.width = Inf) # In response to "Variables not shown" in dplyr; overrides the width of columns that gets printed out (i.e., to display all columns from df)
options(scipen = 999) # 'scipen': integer. A penalty to be applied when deciding to print numeric values in fixed or exponential notation. Positive values bias towards fixed and negative towards scientific notation: fixed notation will be preferred unless it is more than 'scipen' digits wider.
theme_set(theme_dark()) # pre-set the ggplot2 theme.
```
```{r echo=FALSE, warning=FALSE, error=FALSE, results='hide', message=FALSE}
# For users with Esri ArcGIS license, install and load the `arcgisbinding` package for use to export data frames to Esri feature classes
# Reference:
# https://r-arcgis.github.io/assets/arcgisbinding-vignette.html
# https://www.rdocumentation.org/packages/arcgisbinding/versions/1.0.1.229/topics/arc.write
# library(arcgisbinding)
# arc.check_product()
```
# Abstract
**A Comparison of Census 2010 SF1 & Differentially Private Data in Oregon**
Portland State University
Graduate Certificate in Applied Social Demography
USP 522 Research Practicum in Applied Demography
Winter Quarter 2020
[GitHub repository: PSU-USP522-Practicum](https://github.com/a-brasch/PSU-USP522-Practicum "GitHub repository: PSU-USP522-Practicum")
[PSU webspace: USP 522](http://web.pdx.edu/~abrasch/USP522/ "PSU webspace: USP 522")
[Esri Dashboard: Census 2010 SF1 vs. Differential Privacy: Oregon Geographies](https://maulfoster.maps.arcgis.com/apps/opsdashboard/index.html#/6fea214d11784e5aa277157e8f5db3f0 "Esri Dashboard: Census 2010 SF1 vs. Differential Privacy: Oregon Geographies")
[Esri Dashboard]
[R Visualizations]
[MS Excel Charts]
---
The U.S. Census Bureau has committed to deploy a modernized disclosure avoidance system (DAS) in order to safeguard data collected during the 2020 Census. The foundation of the updated DAS methodology is differential privacy—a set of mathematical algorithms developed by cryptographers and computer scientists intended to protect respondent confidentiality. Differential privacy leverages a suite of techniques that includes the introduction of uncertainty (i.e., noise, synthetic data) into aggregated respondent data, data-swapping, and data imputation, with the goal of ensuring that enumerated data sets cannot be backwards-engineered to identify individual respondents. Differential privacy strives to balance confidentiality and data accuracy; however, many researchers have expressed concern over these techniques' impact on data quality and the potential to adversely affect demographic research and policies, as well as the lives of racial/ethnic minorities and under-represented population subgroups (Salvo 2019, Akee 2019, Van Riper 2019, Nagle 2019). To help researchers and data users assess the "fitness for use" of 2020 census data products produced using differential privacy, the Census Bureau prepared 2010 Census Demonstration Data Products, which were made public on October 19, 2019.
Census data are the foundation of a myriad of public- and private-sector efforts that support accurate funding and resource allocation, housing policy development, and emergency preparedness. From the perspective of county and local governments in Oregon, accurate census data is imperative to the success of such activities as drawing appropriately-sized school attendance areas, assessing the impact of changes in land use and zoning, reaching at-risk populations, determining intercensal population estimates, and forecasting future populations (Salvo et. al 2019, 18). The protection of individual respondents privacy is a necessary and absolutely worthwhile endeavor, but it should not sacrifice the accuracy of data that is essential to the aforementioned processes.
The intent of this study is to better understand the effects of differential privacy on census data by comparing multiple demographic variables at various geographic scales within the state of Oregon. Of particular focus are key demographic variables that inform population forecasting and housing and land use planning, including population per age group, average household size, and occupancy rate. The geographic units of analysis include Oregon counties, places, Urban Growth Boundaries (UGB) of Oregon cities, and non-UGB county areas. For analysis and reporting purposes, the project utilizes the R programming language and RStudio integrated development environment, R Markdown (USP522_DiffPrivOR.Rmd), R packages (tidyverse, tidycensus, sf, rgdal, tigris, readxl, writexl, plotly, htmlwidgets, xfun, and arcgisbinding), U.S. Census API, Alteryx, Esri ArcGIS, and Microsoft Excel.
The results of this research, including interactive visualizations, indicate that adjustments to the differential privacy algorithms and privacy loss budget allocation are necessary to ensure that accurate data is prepared for public consumption. Census Bureau analysts have stated that several promising solutions to the widely-recognized post-processing errors have been identified. (Abowd and Velkoff 2020). Numerous researchers and data users are willing to continue to assist in the process of determining the right balance between accuracy and privacy, and it would benefit all stakeholders if the Census Bureau produced more demonstration data products to show progress on correcting post-processing errors and fine-tuning on the the differential privacy mechanism.
# Introduction
The U.S. Census Bureau has committed to deploy a modernized disclosure avoidance system (DAS) in order to safeguard data collected during the 2020 Census. The foundation of the updated DAS methodology is differential privacy—a set of mathematical algorithms developed by cryptographers and computer scientists intended to protect respondent confidentiality. The need to refine the DAS is based on research by the Census Bureau, which concluded that the methods used to protect the confidentiality of earlier census statistics can no longer adequately defend against today’s privacy threats (Abowd and Velkoff 2019). Differential privacy leverages a suite of techniques that includes the introduction of uncertainty (i.e., noise, synthetic data) into aggregated respondent data, data-swapping, and data imputation, with the goal of ensuring that enumerated data sets cannot be backwards-engineered to identify individual respondents. Differential privacy strives to balance confidentiality and data accuracy; however, many researchers have expressed concern over these techniques' impact on data quality and the potential to adversely affect demographic research and policies, as well as the lives of racial/ethnic minorities and under-represented population subgroups (Salvo 2019, Akee 2019, Van Riper 2019, Nagle 2019).
As described by the Census Bureau, differential privacy techniques allow for precise control over the amount of uncertainty that is added to respondent data according to privacy requirements (Abowd and Velkoff 2019). The crux of the debate, therefore, is the balance between privacy and usability. As in other states, numerous stakeholders in Oregon rely on census data to acquire funding, prepare population forecasts for future planning efforts, develop housing and land use policies, and prepare emergency action plans. The accuracy of the census data is imperative to successful planning, budgeting, and program delivery. To help researchers and data users assess the "fitness for use" of upcoming 2020 census data products, demonstration data were produced and made public on October 19, 2019. This set of 2010 Census data products exhibits what can be expected as a result of employing the 2020 DAS. The intent of this study is to better understand the effects of differential privacy on census data by comparing multiple demographic variables at various geographic scales within the state of Oregon. Of particular focus are key demographic variables that inform population forecasting and housing and land use planning, including population per age group, average household size, and occupancy rate. Visualization of the differentially private data in direct comparison to the Census 2010 summary file data will bring greater awareness of the issues that result from injecting unnecessarily large amounts of noise into respondent information.
# Literature Review
The Census Bureau is bound by law to protect individual respondent data gathered during the decennial census. As described in a presentation by Michael Hawes (Senior Adviser for Data Access and Privacy Research and Methodology Directorate U.S. Census Bureau) at the Oregon 2019 Census Data Users Conference, under title 13, Section 9 of the United State Code, the Census Bureau is prohibited from releasing identifiable data "furnished by any particular establishment or individual" (U.S. Census Bureau 2019f). In previous censuses, the privacy of respondent data was protected via data swapping, suppression of geographic information and extreme values, imputation, and perturbation (Ruggles et. al. 2019, 404). Considering advances in mathematics, the strength of widely available computing power, and easy access to large, public databases, there is concern that existing measures no longer adequately protect data privacy. As described by the Census Bureau, there is a growing concern that it may be possible to reconstruct a significant portion of the confidential census data using so-called database reconstruction attack methods, thereby revealing individual respondent data (U.S. Census Bureau 2019d, 5). To avert such potential breaches, the Census Bureau has committed to deploy a modernized DAS to safeguard data collected during the 2020 Census.
DAS's that rely on differential privacy methods are designed to protect individual respondent privacy through the introduction of statistical noise into the data, which produces uncertainty and reduces the likelihood that a specific individual could be identified. As described by the Census Bureau, differential privacy differs from traditional noise-injection privacy methods, in that "the amount of noise required to protect privacy is precisely calibrated to provide provable mathematical guarantees regarding the maximum amount of privacy-loss possible from the publication of data products derived from the confidential data" (U.S. Census Bureau 2019c, 7-2). A differentially private system works by allocating a "privacy-loss budget" to all statistics computed from a confidential dataset (U.S. Census Bureau 2019e). The budget can be envisioned as a spectrum, where on one end completely private data is wholly inaccurate, and on the other end unprotected data is highly accurate because it truly reflects respondent data. As part of differential privacy implementation, a point must be chosen along this spectrum. This choice is a policy decision based on a desired balance between accuracy and confidentiality, which will be made by a committee of senior career Census Bureau data experts and the Data Steward Executive Policy Committee (U.S. Census Bureau 2019e). Therefore, it is imperative for the committees to understand how differential privacy impact the fitness-of-use of the census data.
Numerous research projects have been completed by researchers, scientists, and data users in various fields of study, both prior to and since the release of the 2010 Demonstration Data Products (e.g., Ruggles et. al. 2019, Salvo 2019, Akee 2019, Van Riper 2019, Nagle 2019). Many of these studies were presented at the Workshop on 2020 Census Data Products: Data Needs and Privacy Considerations held on December 11-12, 2019 in Washington DC. The workshop—hosted by the Committee on National Statistics—provided data users a platform from which to present their assessments of the fitness-for-use of the differentially private data. The majority of these studies found considerable issues with the differentially private data, highlighting how the DAS could adversely effect planning, budgeting, and programming efforts. In an overarching rebuke, Ruggles et. al. stated that:
> The differential privacy approach is inconsistent with the statutory obligations, history, and core mission of the Census Bureau". By imposing unrealistic disclosure rules, the Census Bureau may be forced to lock up data that are indispensable for basic research and policy analysis. If public use data become unusable or inaccessible because of overzealous disclosure control, there will be a precipitous decline in the quantity and quality of evidence-based policy research (Ruggles et. al. 2019, 403-404).
In the recent past, the Census Bureau focused on targeted strategies to prevent re-identification attacks, or the ability of an outside adversary to positively identify which person provided a particular response. Although Census Bureau analysts argue that new DAS methods are necessary to prevent database reconstruction, there have been few cases of nefarious actors positively identifying individual respondents using reconstruction methods. In fact, the protections in place—sampling, swapping, suppression of geographic information and extreme values, imputation, and perturbation—have proved successful at protecting personal data (Ruggles et. al. 2019, 404). Moreover, reconstruction of microdata from summary tables does not by itself allow identification of individual respondents. As Ruggles et. al. explain, to determine who the individuals in a reconstructed database actually are, one would then have to match respondent characteristics to an external, respondent-identified database that includes information such as names or social security numbers to complete useful re-identification attack (2019, 405). This is not to suggest that data privacy should be taken lightly or that database reconstruction is not possible or of greater concern today than in the past (due to widely available computing power and the availability of private-sector, respondent-identified data); however, differential privacy may exceed what is necessary to keep data safe under law and precedent (Ruggles et. al. 2019, 406).
Presentations of research at the Committee on National Statistics echoed the preceding claims based on analysis of the 2010 Demonstration Data Products. Of particular note are the studies finding that:
1. Geographies off-spine of the Standard Hierarchy of Census Geographic Entities (e.g., Places) generally exhibit more egregious differences than on-spine geographies (Van Riper and Spielman 2019, 22),
2. The impact of the injected noise varies by place and population sub-group (Van Riper and Spielman 2019, 22), and
3. The current privacy loss budget disproportionately impacts already hard-to-count populations (e.g., American Indian, Alaska Native, and Native Hawaiian).
Additional research also found that data accuracy at the sub-state (county, city, etc.) level will be drastically reduced if the current amount of noise injected into the data is maintained. For example, in a Virginia case study, the total number of girls ages 15-19 in the City of Emporia were decreased from the actual 185 to only 30. Applying this number to the teen pregnancy rate for Emporia increased the rate from 10 percent to 66 percent (Gunter 2020, 1). The overarching finding of many of the recent studies is that statistics produced by government and third-party actors using data in which differential privacy is applied are at risk of being inconsistent and unreasonable for comparison over time.
# Methodology
Since values of variables like total population must be held constant at the state level, the adjustment of values among smaller geographic areas and statistical areas is a zero-sum game. However, recent research found that the differential privacy algorithms being applied to the demonstration data generally shift population from more populous to less populous area and from large race groups to small race groups (Gunter 2020, 1). In order to assess if similar patterns are occurring in Oregon, this study makes comparisons between the original 2010 Summary File 1 census data (i.e., SF 1, sf) and the differentially private demonstration data (i.e., 2010 Census Demonstration Data Files, dp). The geographic units of analysis include Oregon counties, places, Urban Growth Boundaries (UGB) of Oregon cities, and non-UGB county areas. For analysis and reporting purposes, the project utilizes the R programming language and RStudio integrated development environment, R Markdown, R packages (tidyverse, plotly, tidycensus, etc.), U.S. Census API, Alteryx, Esri ArcGIS, and Microsoft Excel. The results of this study, including interactive visualizations, provide the Portland State University (PSU) Population Research Center (PRC) and other Oregon stakeholders with a primer to assess the effects of differential privacy on 2020 Census summary tables and demographic assumptions used to prepare population estimates, population forecasts, housing and land use policies, and other demographic data products.
<span style="color: #BB86FC;">**Note that the majority of referenced files are embedded (i.e., available to download) within the HTML version of this R Markdown document. If not found in-line at the point of reference, the files are located in the [Supporting Data] section. Large and proprietary files were not embedded in the HTML file, but the former may be available upon request.**</span>
# Data Wrangling
To facilitate comparisons between the sf and dp data set, the two segments of data need to be retrieved and prepared for analysis. The following section contains all the necessary steps to retrieve, wrangle, reshape, and blend the sf and dp 2010 census data at the geographic levels of county, congressional district, state legislative district, place, and census block. To save on processing time and avoid local memory limitations, data-intensive code chunks have been converted to comments (e.g., retrieve all Oregon census blocks using `tigris` package). Readers can view the underlying code while the input data is read-in as part of the R Project's local data.
The 2010 Demonstration Data Products are available directly from the U.S. Census Bureau (CB) at the following locations.
[2010 Demonstration Data Products](https://www.census.gov/programs-surveys/decennial-census/2020-census/planning-management/2020-census-data-products/2010-demonstration-data-products.html "2010 Demonstration Data Products")
[2010 Demonstration Data Products FTP Site](https://www2.census.gov/programs-surveys/decennial/2020/program-management/data-product-planning/2010-demonstration-data-products/?# "2010 Demonstration Data Products FTP Site")
> As noted by the CB within the 2010 Demonstration Data Product Technical Documentation, the data in the 2010 Demonstration Data Products are segmented. This is done to manage the volume of data and to facilitate exporting into spreadsheet or database software. The data and the corresponding geographic information for an individual state are known as the file set. Because of the large size of the tables, the 2010 Demonstration Data Products’ Demographic and Housing Demonstration data files will be broken into 15 files: a geographic header record file and 14 data segment files. The Redistricting data will be broken into four files: a geographic header record file and three data segment files. To get a complete set of the 2010 Demonstration Data Products’ data files, users must download the geographic header file and all the data file segments in the package. The zip archive files are in the Unicode Transformation Format (UTF-8). The geographic header file contains fixed fields while the data segments are in a comma-delimited format. Both the geographic header and the data files contain geographic linking fields. These files were created in a Linux environment but should be usable from any operating system. Standard software packages can be used to import and manipulate these data files. These are text files, however, the file extension is not ‘.txt’. The user will need to rename the files with a .txt extension for import into some software packages.
The segmentation, delimitation, and file format of these data make preparation for analysis challenging and time-consuming. To improve ease of use and facilitate timely comparison, two organizations have processed the source data into more user-friendly file types and formats. The IPUMS National Historical Geographic Information System (NHGIS) prepared the two segments of data for comparison by joining them into a more simplified format organized by geography. [These data sets](https://www.nhgis.org/differentially-private-2010-census-data "These data sets") are recommended based on ease of use, but unfortunately, census block data is unavailable, which is required by this study to derive Oregon UGB values based on census block aggregation. Therefore, the data sets prepared by the Cornell Institute for Social and Economic Research (CISER) will be used.
## Demonstration Data Products
CISER prepared simplified alternatives to the files included on the CB’s file transfer protocol site, representing both the 2010 Demographic and Housing Demonstration Files. The combined file was created in SAS and then exported to SPSS, STATA, and CSV using Stat/Transfer. A similar file exists for the 2010 Census SF1 data segments for comparison. The files are available at the following locations.
[Census 2010 DHC Download Center](https://ciser.cornell.edu/data/data-archive/census-2010-dhc-download-center/ "Census 2010 DHC Download Center")
[Census 2010 SF1 Download Center](https://ciser.cornell.edu/data/data-archive/census-2010-sf1-download-center/ "Census 2010 SF1 Download Center")
<span style="color: #BB86FC;">**Note that all code chunks included below were created using the R programming language, RStudio integrated development environment, R Markdown, and R packages (tidyverse, tidycensus, sf, rgdal, tigris, readxl, writexl, plotly, htmlwidgets, xfun, and arcgisbinding).**</span>
Read in the raw sf and dp Oregon CSV files from CISER.
```{r}
# Read-in complete CISER dp dataset
# dp_wide_CISER <- read_csv("./Data/CISER/OR2010DHCCSV/OR2010DHC.CSV", col_types = cols(.default = "c")) %>% # cols(.default) to set all to (c) character
# mutate_at(c(102:2686), as.double) %>% # Reassign all non-geographic identifier variables as numeric double
# mutate_if(is.numeric, ~replace(., is.na(.), 0)) # Replace NA with zero in all numeric variables
# Read-in complete CISER sf dataset
# sf_wide_CISER <- read_csv("./Data/CISER/or2010ur1_all_vars_csv/or2010ur1_all_vars.CSV", col_types = cols(.default = "c")) %>% # cols(.default) to set all to (c) character
# mutate_at(c(102:9060), as.double) %>% # Reassign all non-geographic identifier variables as numeric double
# mutate_if(is.numeric, ~replace(., is.na(.), 0)) # Replace NA with zero in all numeric variables
```
Save as RDS files.
```{r}
# saveRDS(dp_wide_CISER, "./Data/CISER/dp_wide_CISER.rds")
# saveRDS(sf_wide_CISER, "./Data/CISER/sf_wide_CISER.rds")
```
Read in dp and sf RDS files that contain all variables.
```{r class.source = 'fold-show'}
dp_wide_CISER <- readRDS(file = "./Data/CISER/dp_wide_CISER.rds")
sf_wide_CISER <- readRDS(file = "./Data/CISER/sf_wide_CISER.rds")
# Field map of dp
# Observations: 227,437
# Geographic identifiers: 101
# Variables: 2,585
# Total: 2,686
#
# Field map of sf
# Observations: 227,440
# Geographic identifiers: 101
# Variables: 8,959
# Total: 9,060
```
Note that there are 3 more observations in the sf dataset, based on a comparison of the data sets. The three missing observations are unified school districts (NAME):
- Yoncalla School District 32
- School District Not Defined
- Remainder of Oregon
Curate a list of variables of interest and rename the variables to improve recognition. Perform this task outside of the R environment as follows.
The 2010 Census Summary File 1 Technical Documentation PDF file contains metadata on each variable. The file is represented in a machine-readable format within an MS Access database created and supplied by PSU PRC (SF1_PRC.accdb), specifically within the DATA_FIELD_DESCRIPTORS_SF1_PRC table.
Use Alteryx `USP522_PRC_SF1_VariableSelection.yxmd` to create additional necessary variables:
- TABLE NAME
- UNIVERSE
- FIELD DESCRPTION (concatenation of [FIELD NAME]_[TABLE NAME]_[UNIVERSE])
Additionally, remove "Table Name" and "Universe" values from the "Field Name" variable.
Import the augmented tables in the SF1_PRC.accdb as:
- DATA_FIELD_DESCRIPTORS_SF1_PRC_AB
- DATA_FIELD_DESCRIPTORS_SF1_PRC_AB_VarOnly
Create new variables and import into SF1_PRC.accdb (replacing the existing table) as DATA_FIELD_DESCRIPTORS_SF1_PRC_AB_VarOnly:
- USE (Yes or NULL) to represent those variables to include in analysis.
- FIELD_CODE_DESC is a short descriptive name for variables of interest.
Use Alteryx `USP522_PRC_SF1_VariableSelection.yxmd` to create a list of curated variables of interest, including geographic identifiers, as well as a list of new names (FIELD_CODE_DESC) for curated variables. Both lists are formatted in the syntax expected by R/dplyr. Copy/paste these vectors into the `select` and `rename` functions in the `CISER cull and rename` code chunk below.
Reference the following two resources to review the crosswalk of 2010 to 2020 (dp) table numbers and variables IDs.
`2020-census-data-products-planning-crosswalk.xlsx`
`2010_Demonstration_Data_Product_Technical_Document.pdf`
Note that some 2010 tables do not have a 2020 equivalent. For instance, 2010 P42 (Group Quarters Population by Major Group Quarters Type) does not have a 2020 equivalent; instead use the 2010 P43 (Group Quarters Population by Sex by Age by Major Group Quarters Type) [2020 table number P15].
Also note that some (at least one) variables have been renamed. The 2010 H1 table variable H00010001 (total housing units) was renamed to H0010001 in the 2020 (dp) dataset.
Utilize the curated variable list to cull and rename variables from the complete CISER data sets.
```{r CISER cull and rename, class.source = 'fold-show'}
# dp
dp_wide <- dp_wide_CISER %>%
select(FILEID,STUSAB,SUMLEV,GEOCOMP,CHARITER,CIFSN,LOGRECNO,REGION,DIVISION,STATE,COUNTY,PLACE,TRACT,BLKGRP,BLOCK,CD,SLDU,SLDL,NAME,P0010001,P0050001,P0050002,P0050003,P0050004,P0050005,P0050006,P0050007,P0050008,P0050009,P0050010,P0050011,P0050012,P0050013,P0050014,P0050015,P0050016,P0050017,P0120001,P0120002,P0120003,P0120004,P0120005,P0120006,P0120007,P0120008,P0120009,P0120010,P0120011,P0120012,P0120013,P0120014,P0120015,P0120016,P0120017,P0120018,P0120019,P0120020,P0120021,P0120022,P0120023,P0120024,P0120025,P0120026,P0120027,P0120028,P0120029,P0120030,P0120031,P0120032,P0120033,P0120034,P0120035,P0120036,P0120037,P0120038,P0120039,P0120040,P0120041,P0120042,P0120043,P0120044,P0120045,P0120046,P0120047,P0120048,P0120049,P0150001,P0160001,P0430001,H0010001,H0030001,H0030002,H0030003) %>% # Select curated list of variables
rename(pop_total = P0010001, pop_race_total = P0050001, pop_race_notHisp_tot = P0050002, pop_race_notHisp_white = P0050003, pop_race_notHisp_black = P0050004, pop_race_notHisp_amian = P0050005, pop_race_notHisp_asian = P0050006, pop_race_notHisp_nhpi = P0050007, pop_race_notHisp_other = P0050008, pop_race_notHisp_tom = P0050009, pop_race_Hisp_tot = P0050010, pop_race_Hisp_white = P0050011, pop_race_Hisp_black = P0050012, pop_race_Hisp_amian = P0050013, pop_race_Hisp_asian = P0050014, pop_race_Hisp_nhpi = P0050015, pop_race_Hisp_other = P0050016, pop_race_Hisp_tom = P0050017, pop_sex_total = P0120001, pop_sex_m_tot = P0120002, pop_sex_m_00_04 = P0120003, pop_sex_m_05_09 = P0120004, pop_sex_m_10_14 = P0120005, pop_sex_m_15_17 = P0120006, pop_sex_m_18_19 = P0120007, pop_sex_m_20 = P0120008, pop_sex_m_21 = P0120009, pop_sex_m_22_24 = P0120010, pop_sex_m_25_29 = P0120011, pop_sex_m_30_34 = P0120012, pop_sex_m_35_39 = P0120013, pop_sex_m_40_44 = P0120014, pop_sex_m_45_49 = P0120015, pop_sex_m_50_54 = P0120016, pop_sex_m_55_59 = P0120017, pop_sex_m_60_61 = P0120018, pop_sex_m_62_64 = P0120019, pop_sex_m_65_66 = P0120020, pop_sex_m_67_69 = P0120021, pop_sex_m_70_74 = P0120022, pop_sex_m_75_79 = P0120023, pop_sex_m_80_84 = P0120024, pop_sex_m_85_over = P0120025, pop_sex_f_tot = P0120026, pop_sex_f_00_04 = P0120027, pop_sex_f_05_09 = P0120028, pop_sex_f_10_14 = P0120029, pop_sex_f_15_17 = P0120030, pop_sex_f_18_19 = P0120031, pop_sex_f_20 = P0120032, pop_sex_f_21 = P0120033, pop_sex_f_22_24 = P0120034, pop_sex_f_25_29 = P0120035, pop_sex_f_30_34 = P0120036, pop_sex_f_35_39 = P0120037, pop_sex_f_40_44 = P0120038, pop_sex_f_45_49 = P0120039, pop_sex_f_50_54 = P0120040, pop_sex_f_55_59 = P0120041, pop_sex_f_60_61 = P0120042, pop_sex_f_62_64 = P0120043, pop_sex_f_65_66 = P0120044, pop_sex_f_67_69 = P0120045, pop_sex_f_70_74 = P0120046, pop_sex_f_75_79 = P0120047, pop_sex_f_80_84 = P0120048, pop_sex_f_85_over = P0120049, hh_total = P0150001, pop_hh_total = P0160001, pop_gq_total = P0430001, hu_total = H0010001, hu_occpan_total = H0030001, hu_occpan_occ_tot = H0030002, hu_occpan_vac_tot = H0030003) # Rename variables to improve recognition
# sf
sf_wide <- sf_wide_CISER %>%
select(FILEID,STUSAB,SUMLEV,GEOCOMP,CHARITER,CIFSN,LOGRECNO,REGION,DIVISION,STATE,COUNTY,PLACE,TRACT,BLKGRP,BLOCK,CD,SLDU,SLDL,NAME,P0010001,P0050001,P0050002,P0050003,P0050004,P0050005,P0050006,P0050007,P0050008,P0050009,P0050010,P0050011,P0050012,P0050013,P0050014,P0050015,P0050016,P0050017,P0120001,P0120002,P0120003,P0120004,P0120005,P0120006,P0120007,P0120008,P0120009,P0120010,P0120011,P0120012,P0120013,P0120014,P0120015,P0120016,P0120017,P0120018,P0120019,P0120020,P0120021,P0120022,P0120023,P0120024,P0120025,P0120026,P0120027,P0120028,P0120029,P0120030,P0120031,P0120032,P0120033,P0120034,P0120035,P0120036,P0120037,P0120038,P0120039,P0120040,P0120041,P0120042,P0120043,P0120044,P0120045,P0120046,P0120047,P0120048,P0120049,P0150001,P0160001,P0430001,H00010001,H0030001,H0030002,H0030003) %>% # Select curated list of variables
rename(pop_total = P0010001, pop_race_total = P0050001, pop_race_notHisp_tot = P0050002, pop_race_notHisp_white = P0050003, pop_race_notHisp_black = P0050004, pop_race_notHisp_amian = P0050005, pop_race_notHisp_asian = P0050006, pop_race_notHisp_nhpi = P0050007, pop_race_notHisp_other = P0050008, pop_race_notHisp_tom = P0050009, pop_race_Hisp_tot = P0050010, pop_race_Hisp_white = P0050011, pop_race_Hisp_black = P0050012, pop_race_Hisp_amian = P0050013, pop_race_Hisp_asian = P0050014, pop_race_Hisp_nhpi = P0050015, pop_race_Hisp_other = P0050016, pop_race_Hisp_tom = P0050017, pop_sex_total = P0120001, pop_sex_m_tot = P0120002, pop_sex_m_00_04 = P0120003, pop_sex_m_05_09 = P0120004, pop_sex_m_10_14 = P0120005, pop_sex_m_15_17 = P0120006, pop_sex_m_18_19 = P0120007, pop_sex_m_20 = P0120008, pop_sex_m_21 = P0120009, pop_sex_m_22_24 = P0120010, pop_sex_m_25_29 = P0120011, pop_sex_m_30_34 = P0120012, pop_sex_m_35_39 = P0120013, pop_sex_m_40_44 = P0120014, pop_sex_m_45_49 = P0120015, pop_sex_m_50_54 = P0120016, pop_sex_m_55_59 = P0120017, pop_sex_m_60_61 = P0120018, pop_sex_m_62_64 = P0120019, pop_sex_m_65_66 = P0120020, pop_sex_m_67_69 = P0120021, pop_sex_m_70_74 = P0120022, pop_sex_m_75_79 = P0120023, pop_sex_m_80_84 = P0120024, pop_sex_m_85_over = P0120025, pop_sex_f_tot = P0120026, pop_sex_f_00_04 = P0120027, pop_sex_f_05_09 = P0120028, pop_sex_f_10_14 = P0120029, pop_sex_f_15_17 = P0120030, pop_sex_f_18_19 = P0120031, pop_sex_f_20 = P0120032, pop_sex_f_21 = P0120033, pop_sex_f_22_24 = P0120034, pop_sex_f_25_29 = P0120035, pop_sex_f_30_34 = P0120036, pop_sex_f_35_39 = P0120037, pop_sex_f_40_44 = P0120038, pop_sex_f_45_49 = P0120039, pop_sex_f_50_54 = P0120040, pop_sex_f_55_59 = P0120041, pop_sex_f_60_61 = P0120042, pop_sex_f_62_64 = P0120043, pop_sex_f_65_66 = P0120044, pop_sex_f_67_69 = P0120045, pop_sex_f_70_74 = P0120046, pop_sex_f_75_79 = P0120047, pop_sex_f_80_84 = P0120048, pop_sex_f_85_over = P0120049, hh_total = P0150001, pop_hh_total = P0160001, pop_gq_total = P0430001, hu_total = H00010001, hu_occpan_total = H0030001, hu_occpan_occ_tot = H0030002, hu_occpan_vac_tot = H0030003) # Rename variables to improve recognition
# Field map
# Geographic identifiers: 19
# Variables: 74
# Total: 93
```
Remove the complete CISER data frames from the R environment (to save on memory).
```{r}
remove(dp_wide_CISER)
remove(sf_wide_CISER)
```
Calculate additional variables.
```{r}
# dp
# Calculate missing 5-year age groups by sex variables by summing composite variables
# male
dp_wide <- dp_wide %>%
mutate(pop_sex_m_15_19 = pop_sex_m_15_17 + pop_sex_m_18_19) %>%
mutate(pop_sex_m_20_24 = pop_sex_m_20 + pop_sex_m_21 + pop_sex_m_22_24) %>%
mutate(pop_sex_m_60_64 = pop_sex_m_60_61 + pop_sex_m_62_64) %>%
mutate(pop_sex_m_65_69 = pop_sex_m_65_66 + pop_sex_m_67_69)
# female
dp_wide <- dp_wide %>%
mutate(pop_sex_f_15_19 = pop_sex_f_15_17 + pop_sex_f_18_19) %>%
mutate(pop_sex_f_20_24 = pop_sex_f_20 + pop_sex_f_21 + pop_sex_f_22_24) %>%
mutate(pop_sex_f_60_64 = pop_sex_f_60_61 + pop_sex_f_62_64) %>%
mutate(pop_sex_f_65_69 = pop_sex_f_65_66 + pop_sex_f_67_69)
# Calculate average household size
dp_wide <- dp_wide %>%
mutate(hh_avgsz = pop_hh_total / hu_occpan_occ_tot)
# Calculate occupancy and vacancy percentages
dp_wide <- dp_wide %>%
mutate(hu_occpan_occ_perc = hu_occpan_occ_tot / hu_occpan_total) %>%
mutate(hu_occpan_vac_perc = hu_occpan_vac_tot / hu_occpan_total)
dp_wide <- dp_wide %>%
mutate(pop_sex_sexratio = (pop_sex_m_tot / pop_sex_f_tot) * 100)
# sf
# Calculate missing 5-year age groups by sex variables by summing composite variables
# male
sf_wide <- sf_wide %>%
mutate(pop_sex_m_15_19 = pop_sex_m_15_17 + pop_sex_m_18_19) %>%
mutate(pop_sex_m_20_24 = pop_sex_m_20 + pop_sex_m_21 + pop_sex_m_22_24) %>%
mutate(pop_sex_m_60_64 = pop_sex_m_60_61 + pop_sex_m_62_64) %>%
mutate(pop_sex_m_65_69 = pop_sex_m_65_66 + pop_sex_m_67_69)
# female
sf_wide <- sf_wide %>%
mutate(pop_sex_f_15_19 = pop_sex_f_15_17 + pop_sex_f_18_19) %>%
mutate(pop_sex_f_20_24 = pop_sex_f_20 + pop_sex_f_21 + pop_sex_f_22_24) %>%
mutate(pop_sex_f_60_64 = pop_sex_f_60_61 + pop_sex_f_62_64) %>%
mutate(pop_sex_f_65_69 = pop_sex_f_65_66 + pop_sex_f_67_69)
# Calculate average household size
sf_wide <- sf_wide %>%
mutate(hh_avgsz = pop_hh_total / hu_occpan_occ_tot)
# Calculate occupancy and vacancy percentages
sf_wide <- sf_wide %>%
mutate(hu_occpan_occ_perc = hu_occpan_occ_tot / hu_occpan_total) %>%
mutate(hu_occpan_vac_perc = hu_occpan_vac_tot / hu_occpan_total)
# Calculate sex ratio
sf_wide <- sf_wide %>%
mutate(pop_sex_sexratio = (pop_sex_m_tot / pop_sex_f_tot) * 100)
# Field map
# Geographic identifiers: 19
# Existing variables: 74
# New variables: 12
# Variables: 86
# Total: 105
```
Reshape data frame to long format.
```{r class.source = 'fold-show'}
# CISER dp data
dp_long <- dp_wide %>%
gather(variable, dp, 20:length(.)) %>% # Transpose all non-geographic variables
mutate(dp = as.numeric(dp, na.rm = T)) # Set the dp variable to be numeric and ensure NULLs are handled with na.rm = TRUE
# CISER sf data
sf_long <- sf_wide %>%
gather(variable, sf, 20:length(.)) %>% # Transpose all non-geographic variables
mutate(sf = as.numeric(sf, na.rm = T)) # Set the sf variable to be numeric and ensure NULLs are handled with na.rm = TRUE
# Field map
# Geographic identifiers: 19
# Variable names: 1
# Variable values: 1
# Total: 21
# 19,559,582 observations (227,437 * 86 name-value pairs)
# 19,559,840 observations (227,440 * 86 name-value pairs)
```
Remove the wide data frames.
```{r}
remove(dp_wide)
remove(sf_wide)
```
Calculate total population by age variables.
```{r}
# dp
# Combine (sum) the male and female age group variables
# Determine fields to GroupBy
long_gbcol = names(dp_long)[-21] # All fields except for values in the 21st column
dp_long_mf <- dp_long %>%
filter(str_detect(variable, "^pop_sex_m_") | str_detect(variable, "^pop_sex_f_")) %>% # Filter all male and female age group variables
mutate(variable=str_replace(variable,"m_","tot_"), variable=str_replace(variable,"f_","tot_")) %>% # Remove male/female from variable name
group_by_at(vars(one_of(long_gbcol))) %>% # GroupBy key columns, all except for dp values column
summarise(dp = sum(dp, na.rm=T)) %>% # Sum values per group (i.e. [m] + [f]); ensure na.rm=T to handle NULLs properly
ungroup() %>% # Ungroup to return to the starting dataframe structure
filter(variable != "pop_sex_tot_tot") # Remove the sum of m_tot and f_tot
# Union the new age group total variables to the dp_long data frame
dp_long <- rbind(dp_long, dp_long_mf)
remove(dp_long_mf)
# sf
# Combine (sum) the male and female age group variables
sf_long_mf <- sf_long %>%
filter(str_detect(variable, "^pop_sex_m_") | str_detect(variable, "^pop_sex_f_")) %>% # Filter all male and female age group variables
mutate(variable=str_replace(variable,"m_","tot_"), variable=str_replace(variable,"f_","tot_")) %>% # Remove male/female from variable name
group_by_at(vars(one_of(long_gbcol))) %>% # GroupBy key columns, all except for dp values in the 46th column
summarise(sf = sum(sf, na.rm=T)) %>% # Sum values per group (i.e. [m] + [f]); ensure na.rm=T to handle NULLs properly
ungroup() %>% # Ungroup to return to the starting dataframe structure
filter(variable != "pop_sex_tot_tot") # Remove the sum of m_tot and f_tot
# Union the new age group total variables to the sf_long data frame
sf_long <- rbind(sf_long, sf_long_mf)
remove(sf_long_mf)
```
Calculate race variables that do not distinguish between Hispanic or Latino origin.
```{r}
# dp
# Combine (sum) the Hispanic & not Hispanic race variables
dp_long_race <- dp_long %>%
filter(str_detect(variable, "^pop_race_notHisp_") | str_detect(variable, "^pop_race_Hisp")) %>% # Filter all race variables
mutate(variable=str_replace(variable,"notHisp_","tot_"), variable=str_replace(variable,"Hisp_","tot_")) %>% # Remove notHisp and Hisp from variable name
group_by_at(vars(one_of(long_gbcol))) %>% # GroupBy key columns, all except for dp values column
summarise(dp = sum(dp, na.rm=T)) %>% # Sum values per group
ungroup() %>% # Ungroup to return to the starting dataframe structure
filter(variable != "pop_race_tot_tot") # Remove the sum of pop_race_Hisp_tot and pop_race_notHisp_tot
# Union the new total variables to the dp_long data frame
dp_long <- rbind(dp_long, dp_long_race)
remove(dp_long_race)
# sf
# Combine (sum) the male and female age group variables
sf_long_race <- sf_long %>%
filter(str_detect(variable, "^pop_race_notHisp_") | str_detect(variable, "^pop_race_Hisp")) %>% # Filter all race variables
mutate(variable=str_replace(variable,"notHisp_","tot_"), variable=str_replace(variable,"Hisp_","tot_")) %>% # Remove notHisp and Hisp from variable name
group_by_at(vars(one_of(long_gbcol))) %>% # GroupBy key columns, all except for sf values column
summarise(sf = sum(sf, na.rm=T)) %>% # Sum values per group
ungroup() %>% # Ungroup to return to the starting dataframe structure
filter(variable != "pop_race_tot_tot") # Remove the sum of pop_race_Hisp_tot and pop_race_notHisp_tot
# Union the new total variables to the sf_long data frame
sf_long <- rbind(sf_long, sf_long_race)
remove(sf_long_race)
remove(long_gbcol)
# Field map
# Geographic identifiers: 19
# Variable names: 1
# Variable values: 1
# Total: 21
# Name-value pairs added: 34 (+ 86 = 120)
# 27,292,440 observations (227,437 * 120 name-value pairs)
# 27,292,800 observations (227,440 * 120 name-value pairs)
```
Save out dp and sf long data frames to RDS files.
```{r}
# saveRDS(dp_long, "./Data/intermediates/dp_long.rds")
# saveRDS(sf_long, "./Data/intermediates/sf_long.rds")
```
Subset data by geography: counties, congressional districts, state legislative districts, places, and blocks.
Reference: 2010 Census Summary File 1.pdf
Count of (2010) Oregon geographies of interest:
- Counties: 36
- Congressional Districts: 5 (111th Congress January 2009-January 2011) and 113th Congress January 2013-January 2015)
- State legislative districts, upper chamber (senate): 30
- State legislative districts, lower chamber (house): 60
- Places: 377 (242 incorporated places and 135 census designated places (CDPs))
- Tracts: 834
- Block groups: 2,634
- Blocks: 196,621
```{r}
# Counties
dp_long_co <- dp_long %>%
filter(SUMLEV == "050" & GEOCOMP == "00")
sf_long_co <- sf_long %>%
filter(SUMLEV == "050" & GEOCOMP == "00")
# Congressional Districts
dp_long_cd <- dp_long %>%
filter(SUMLEV == "500")
sf_long_cd <- sf_long %>%
filter(SUMLEV == "500")
# State Legislative Districts, upper and lower chambers
dp_long_sldu <- dp_long %>%
filter(SUMLEV == "610")
sf_long_sldu <- sf_long %>%
filter(SUMLEV == "610")
dp_long_sldl <- dp_long %>%
filter(SUMLEV == "620")
sf_long_sldl <- sf_long %>%
filter(SUMLEV == "620")
# Places
dp_long_pl <- dp_long %>%
filter(SUMLEV == "160" & GEOCOMP == "00")
sf_long_pl <- sf_long %>%
filter(SUMLEV == "160" & GEOCOMP == "00")
# Blocks
dp_long_bl <- dp_long %>%
filter(SUMLEV == "100")
sf_long_bl <- sf_long %>%
filter(SUMLEV == "100")
# Field map
# Geographic identifiers: 19
# Variable names: 1
# Variable values: 1
# Total: 21
# Number of Obs = x geographic entities * y name-value pairs (e.g., 5 congressional districts * 120 = 600 observations)
```
Create geographic identifiers (GEOID). [Reference Understanding Geographic Identifiers](https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html# "Reference Understanding Geographic Identifiers")
```{r}
# dp
dp_long_co <- dp_long_co %>%
unite(GEOID, c("STATE", "COUNTY"), sep = "", remove = F) %>% # Concatenate fields, sep = "" as not to include underscore between values, and remove = F as not to remove to originator fields
select(GEOID, everything()) # Place GEOID field at the beginning of the data frame
dp_long_cd <- dp_long_cd %>%
unite(GEOID, c("STATE", "CD"), sep = "", remove = F) %>%
select(GEOID, everything())
dp_long_sldu <- dp_long_sldu %>%
unite(GEOID, c("STATE", "SLDU"), sep = "", remove = F) %>%
select(GEOID, everything())
dp_long_sldl <- dp_long_sldl %>%
unite(GEOID, c("STATE", "SLDL"), sep = "", remove = F) %>%
select(GEOID, everything())
dp_long_pl <- dp_long_pl %>%
unite(GEOID, c("STATE", "PLACE"), sep = "", remove = F) %>%
select(GEOID, everything())
dp_long_bl <- dp_long_bl %>%
unite(GEOID, c("STATE", "COUNTY", "TRACT", "BLOCK"), sep = "", remove = F) %>%
select(GEOID, everything())
# sf
sf_long_co <- sf_long_co %>%
unite(GEOID, c("STATE", "COUNTY"), sep = "", remove = F) %>%
select(GEOID, everything())
sf_long_cd <- sf_long_cd %>%
unite(GEOID, c("STATE", "CD"), sep = "", remove = F) %>%
select(GEOID, everything())
sf_long_sldu <- sf_long_sldu %>%
unite(GEOID, c("STATE", "SLDU"), sep = "", remove = F) %>%
select(GEOID, everything())
sf_long_sldl <- sf_long_sldl %>%
unite(GEOID, c("STATE", "SLDL"), sep = "", remove = F) %>%
select(GEOID, everything())
sf_long_pl <- sf_long_pl %>%
unite(GEOID, c("STATE", "PLACE"), sep = "", remove = F) %>%
select(GEOID, everything())
sf_long_bl <- sf_long_bl %>%
unite(GEOID, c("STATE", "COUNTY", "TRACT", "BLOCK"), sep = "", remove = F) %>%
select(GEOID, everything())
# Field map
# Geographic identifiers: 20
# Variable names: 1
# Variable values: 1
# Total: 22
```
Remove the non-geographically subset data frames from the R environment (to save on memory).
```{r}
remove(dp_long)
remove(sf_long)
```
Union the geographically subsetted data frames, keeping only the GEOID and NAME geographic identifiers.
```{r}
dp_long_co <- dp_long_co %>%
mutate(GEOGRAPHY = "County") %>% # Create GEOGRAPHY variable
select(GEOGRAPHY, GEOID, NAME, variable, dp) # Remove unncessary fields
dp_long_cd <- dp_long_cd %>%
mutate(GEOGRAPHY = "Congressional District") %>%
select(GEOGRAPHY, GEOID, NAME, variable, dp)
dp_long_sldu <- dp_long_sldu %>%
mutate(GEOGRAPHY = "State Legislative District Upper House") %>%
select(GEOGRAPHY, GEOID, NAME, variable, dp)
dp_long_sldl <- dp_long_sldl %>%
mutate(GEOGRAPHY = "State Legislative District Lower House") %>%
select(GEOGRAPHY, GEOID, NAME, variable, dp)
dp_long_pl <- dp_long_pl %>%
mutate(GEOGRAPHY = "Place") %>%
select(GEOGRAPHY, GEOID, NAME, variable, dp)
dp_long_bl <- dp_long_bl %>%
mutate(GEOGRAPHY = "Block") %>%
select(GEOGRAPHY, GEOID, NAME, variable, dp)
sf_long_co <- sf_long_co %>%
mutate(GEOGRAPHY = "County") %>%
select(GEOGRAPHY, GEOID, NAME, variable, sf)
sf_long_cd <- sf_long_cd %>%
mutate(GEOGRAPHY = "Congressional District") %>%
select(GEOGRAPHY, GEOID, NAME, variable, sf)
sf_long_sldu <- sf_long_sldu %>%
mutate(GEOGRAPHY = "State Legislative District Upper House") %>%
select(GEOGRAPHY, GEOID, NAME, variable, sf)
sf_long_sldl <- sf_long_sldl %>%
mutate(GEOGRAPHY = "State Legislative District Lower House") %>%
select(GEOGRAPHY, GEOID, NAME, variable, sf)
sf_long_pl <- sf_long_pl %>%
mutate(GEOGRAPHY = "Place") %>%
select(GEOGRAPHY, GEOID, NAME, variable, sf)
sf_long_bl <- sf_long_bl %>%
mutate(GEOGRAPHY = "Block") %>%
select(GEOGRAPHY, GEOID, NAME, variable, sf)
# Union the data frames
dp_long_geog <- rbind(dp_long_co, dp_long_cd, dp_long_sldu, dp_long_sldl, dp_long_pl, dp_long_bl)
sf_long_geog <- rbind(sf_long_co, sf_long_cd, sf_long_sldu, sf_long_sldl, sf_long_pl, sf_long_bl)
# Field map
# Geographic identifiers: 3
# Variable names: 1
# Variable values: 1
# Total: 5
# Obs: 23,655,480 (sum of geog obs)
```
Join dp and sf data frames, and calculate the difference between dp and sf variables.
```{r class.source = 'fold-show'}
# Join based on multiple fields
dpsf_long_geog <- dp_long_geog %>%
inner_join(sf_long_geog, by = c('GEOGRAPHY' = 'GEOGRAPHY', 'GEOID' = 'GEOID', 'NAME' = 'NAME', 'variable' = 'variable')) %>%
mutate(diff = dp - sf)
# Field map
# Geographic identifiers: 3
# Variable names: 1
# Variable values: 3
# Total: 7
# Obs: 23,655,480
```
Reshape to wide format.
```{r class.source = 'fold-show'}
# Create a function to gather %>% unite %>% spread for repeated use
spread_n <- function(df, key, value) {
# quote key
keyq <- rlang::enquo(key)
# break value vector into quotes
valueq <- rlang::enquo(value)
s <- rlang::quos(!!valueq)
df %>% gather(value_type, value, !!!s) %>%
unite(temp, !!keyq, value_type) %>%
spread(temp, value)}
dpsf_wide_geog <- dpsf_long_geog %>%
spread_n(variable, c(dp, sf, diff)) # Spread on multiple value variables, adding variable name as a suffix
# Field map
# Geographic identifiers: 3
# Variables: 360 (120 * 3 [dp, sf, diff])
# Total: 363
# Obs: 197,129 (sum of all geographic entities)
```
Save out the dpsf long and wide geographically categorized and subsetted data frames to RDS files.
```{r}
# Save as RDS
# saveRDS(dp_long_geog, "./Data/intermediates/dp_long_geog.rds")
# saveRDS(sf_long_geog, "./Data/intermediates/sf_long_geog.rds")
# saveRDS(dpsf_long_geog, "./Data/intermediates/dpsf_long_geog.rds")
# saveRDS(dpsf_wide_geog, "./Data/intermediates/dpsf_wide_geog.rds")
```
## Spatial Analysis
Retrieve geospatial data for all Oregon counties, congressional districts, state legislative districts (upper and lower), places, and blocks.
```{r warning=FALSE, message=FALSE, results='hide'}
# Retrieve data via tigris package
# Reference: https://cran.r-project.org/web/packages/tigris/tigris.pdf
# By default `tigris` retrieves the most recent vintage of a dataset, so year specification is required
# Default Census TIGER/Line coordinate reference system (CRS) is NAD83 EPSG 4269
# https://spatialreference.org/ref/epsg/nad83)
# gc() # A call of gc causes a garbage collection to take place. The primary purpose of calling gc is for the report on memory usage. Use this before and after a data-heavy processing task.
# shp_co <- tigris::counties("OR", year = 2010, cb = FALSE)
# shp_bl <- tigris::blocks("OR", year = 2010) # Only TIGER/Line Shapefiles available, so no cb parameter necessary
# gc()
# Cannot specify year for cd, sldu, sldl, and pl: "Error: tigris::congressional_districts is not currently available for years prior to 2011. To request this feature, file an issue at https://github.com/walkerke/tigris."
# shp_cd <- tigris::congressional_districts(cb = FALSE) %>%
# filter(STATEFP == "41") # State specification parameter non-existant, so filter to Oregon using STATEFP
# shp_sldu <- tigris::state_legislative_districts("OR", house = "upper", cb = FALSE) # Specify house, cannot specify year (see above)
# shp_sldl <- tigris::state_legislative_districts("OR", house = "lower", cb = FALSE) # Specify house, cannot specify year (see above)
# shp_pl <- tigris::places("OR")
# Therefore, obtain manually from Census TIGER/Line web interface
# https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html
# shp_cd <- st_read("./Data/Census/TIGER/tl_2010_41_cd111/tl_2010_41_cd111.shp")
# shp_sldu <- st_read("./Data/Census/TIGER/tl_2010_41_sldu10/tl_2010_41_sldu10.shp")
# shp_sldl <- st_read("./Data/Census/TIGER/tl_2010_41_sldl10/tl_2010_41_sldl10.shp")
# shp_pl <- st_read("./Data/Census/TIGER/tl_2010_41_place10/tl_2010_41_place10.shp")
```
Save out the geospatial data to RDS files.
```{r}
# Save out as RDS files
# saveRDS(shp_co,"./Data/intermediates/shp_co.rds")
# saveRDS(shp_cd,"./Data/intermediates/shp_cd.rds")
# saveRDS(shp_sldu,"./Data/intermediates/shp_sldu.rds")
# saveRDS(shp_sldl,"./Data/intermediates/shp_sldl.rds")
# saveRDS(shp_pl,"./Data/intermediates/shp_pl.rds")
# saveRDS(shp_bl,"./Data/intermediates/shp_bl.rds")
```
Read in the geospatial data previously retrieved via `tigris` and Census TIGER/Line web interface.
```{r class.source = 'fold-show'}
# Read in the geospatial data
shp_co <- readRDS("./Data/intermediates/shp_co.rds")
shp_cd <- readRDS("./Data/intermediates/shp_cd.rds")
shp_sldu <- readRDS("./Data/intermediates/shp_sldu.rds")
shp_sldl <- readRDS("./Data/intermediates/shp_sldl.rds")
shp_pl <- readRDS("./Data/intermediates/shp_pl.rds")
shp_bl <- readRDS("./Data/intermediates/shp_bl.rds")
```
Standardize variables.
```{r}
# Use NAMELSAD10 for all geographies. For counties and places, this includes suffixes (e.g., " County") in the name, which is maintained in the ORIG_NAME variable, but removed in the NAME variable, for joining with the non-spatial data
shp_co <- shp_co %>%
mutate(GEOID = GEOID10) %>% # Rename GEOID10 to GEOID to match non-spatial data
mutate(GEOGRAPHY = "County") %>% # Create GEOGRAPHY variable
mutate(ORIG_NAME = NAMELSAD10) %>% # Rename NAMELSAD to NAME to match non-spatial data
mutate(NAME = NAMELSAD10) %>% # Create new NAME variable
mutate(NAME = str_replace(NAME, " County", "")) %>%
select(GEOGRAPHY, GEOID, NAME, ORIG_NAME, geometry) # Keep on necessary variables
# Ensure there are no trailing spaces
# aggregate(NAME~NAME1, transform(shp_co, NAME1=NAME), FUN=function(x) nchar(unique(x)))
shp_cd <- shp_cd %>%
mutate(GEOID = GEOID10) %>%
mutate(ORIG_NAME = NAMELSAD10) %>% # Only created to maintain standardized schema
mutate(NAME = NAMELSAD10) %>%
mutate(GEOGRAPHY = "Congressional District") %>%
select(GEOGRAPHY, GEOID, NAME, ORIG_NAME, geometry)
shp_sldu <- shp_sldu %>%
mutate(GEOID = GEOID10) %>%
mutate(ORIG_NAME = NAMELSAD10) %>% # Only created to maintain standardized schema
mutate(NAME = NAMELSAD10) %>%
mutate(GEOGRAPHY = "State Legislative District Upper House") %>%
select(GEOGRAPHY, GEOID, NAME, ORIG_NAME, geometry)
shp_sldl <- shp_sldl %>%
mutate(GEOID = GEOID10) %>%
mutate(ORIG_NAME = NAMELSAD10) %>% # Only created to maintain standardized schema
mutate(NAME = NAMELSAD10) %>%
mutate(GEOGRAPHY = "State Legislative District Lower House") %>%
select(GEOGRAPHY, GEOID, NAME, ORIG_NAME, geometry)
shp_pl <- shp_pl %>%
mutate(GEOID = GEOID10) %>%
mutate(ORIG_NAME = NAMELSAD10) %>% # Rename NAMELSAD to NAME to match non-spatial data
mutate(NAME = NAME10) %>%
mutate(GEOGRAPHY = "Place") %>%
mutate(NAME = str_replace(NAME, " city", "")) %>%
mutate(NAME = str_replace(NAME, " town", "")) %>%
select(GEOGRAPHY, GEOID, NAME, ORIG_NAME, geometry)
shp_bl <- shp_bl %>%
mutate(GEOID = GEOID10) %>%
mutate(ORIG_NAME = NAME10) %>% # Only created to maintain standardized schema
mutate(NAME = NAME10) %>%
mutate(GEOGRAPHY = "Block") %>%
select(GEOGRAPHY, GEOID, NAME, ORIG_NAME, geometry)
# Union the shapefiles
shp_all <- rbind(shp_co, shp_cd, shp_sldu, shp_sldl, shp_pl, shp_bl)
# Field map
# Geographic identifiers: 4
# Geometry: 1
# Total: 5
# Obs: 197,129 (sum of all geographic entities)
```
Join the wide format attribute data to the geospatial data.
```{r class.source = 'fold-show'}
dpsf_wide_geog_geo <- shp_all %>%
inner_join(dpsf_wide_geog, by = c('GEOGRAPHY' = 'GEOGRAPHY', 'GEOID' = 'GEOID', 'ORIG_NAME' = 'NAME'))
# Field map
# Geographic identifiers: 4
# Variables: 360 (120 * 3 [dp, sf, diff])
# Geometry: 1
# Total: 365
# Obs: 197,129 (sum of all geographic entities)
```
Save out the dpsf wide spatial data frame as a RDS file.
```{r}
# saveRDS(dpsf_wide_geog_geo,"./Data/intermediates/dpsf_wide_geog_geo.rds")
```
The PSU PRC Oregon Population Forecast Program (OPFP) is mandated by law (see `Population Forecast Components section` below) to prepare population forecasts for counties and cities within the State of Oregon. In order to assess the potential effects of differential privacy on the OPFP's forecasts for Urban Growth Boundaries (UGB, the block data must be allocated and aggregated to these larger geographies. Specifically, the block data is spatially allocated to the most recent UGBs (i.e. nearest to forecast launch year [2018]) using a simple centroid allocation method, in which the central point of a block polygon is derived and spatially joined to a UGB or non-UGB county area. For blocks that border or intersect a UGB, the OPFP spatially reviews the areas to ensure that at least 50% of housing units are located within the UGB before assigning it to an urban area (personal communication with Nick Chun, former OPFP Manager, 2019).
Read in the 2018 UGBs (available from the Oregon Spatial Data Library).
```{r results='hide', class.source = 'fold-show'}
shp_UGB2018 <- st_read("./Data/GIS/Oregon Spatial Data Library/UGB_2018.shp")
st_crs(shp_UGB2018) # Check crs
shp_UGB2018 <- shp_UGB2018 %>%
st_transform(st_crs(shp_co)) # Transform into shp_co crs (NAD 83 EPSG 4269)
st_crs(shp_UGB2018)
shp_UGB2018 <- shp_UGB2018 %>%
select(InstName, instCode)
```
Save UGBs as RDS file.
```{r}
# saveRDS(shp_UGB2018,"./Data/intermediates/shp_UGB2018.rds")
```
Spatial join the blocks and 2018 UGBs.
```{r warning=FALSE, results='hide', message=FALSE, class.source = 'fold-show', error=FALSE}
# Create a data frame of only blocks
dpsf_wide_bl_geo <- dpsf_wide_geog_geo %>%
filter(GEOGRAPHY == "Block")
# Convert blocks to centroids
shp_bl_pts <- st_centroid(shp_bl)
# Spatial join block points and 2018 UGBs, and join block points (with UGBs assigned) to block polygons
dpsf_wide_bl_geo <- st_join(shp_bl_pts, shp_UGB2018 %>% # Spatial join block pts and UGBs
select(InstName, instCode)) %>% # Keep only the UGB name and code variables
st_set_geometry(NULL) %>% # Remove the geometry from the block pts data frame
select(GEOID, InstName, instCode) %>% # Keep only the GEOID and UGB name & code variables
inner_join(dpsf_wide_bl_geo, . , by = c("GEOID" = "GEOID")) %>% # Join the block polygons and the block pts (with UGBs assigned)
select(GEOGRAPHY, GEOID, NAME, InstName, instCode, everything())
```
Join blocks and counties to retrieve county name per block.
```{r}
# Create temporary county data frame with geometry set to null.
shp_co_temp <- shp_co %>%
st_set_geometry(NULL) %>% # Remove geometry
mutate(COUNTY_NAME = NAME) %>% # Rename variable
mutate(COUNTY_GEOID = GEOID) %>% # Rename variable
select(COUNTY_GEOID, COUNTY_NAME)
# Join blocks and counties to retrieve county name per block
dpsf_wide_bl_geo <- dpsf_wide_bl_geo %>%
mutate(COUNTY_GEOID = substr(dpsf_wide_bl_geo$GEOID, 0, 5)) %>%
inner_join(shp_co_temp, by = c("COUNTY_GEOID" = "COUNTY_GEOID")) %>% # Inner join based on County GEOID
mutate(UGB_GEOG = case_when(is.na(InstName)~paste0("non-UGB County"), T~"UGB")) %>% # If InstName is NA, then UGB_GEOG is "non-UGB County", else "UGB"
mutate(InstName = case_when(is.na(InstName)~paste0(COUNTY_NAME), T~InstName)) %>% # If InstName is NA, then replace with County name
select(GEOGRAPHY, GEOID, NAME, InstName, instCode, UGB_GEOG, everything())
remove(shp_co_temp)
# Field map of blocks
# Geographic identifiers: 9
# Variables: 360 (120 * 3 [dp, sf, diff])
# Geometry: 1
# Total: 370
# Obs: 196,621 (sum of all geographic entities)
```
Aggregate blocks to UGBs.
```{r warning=FALSE}
dpsf_wide_UGB_nonUGBcounty <- dpsf_wide_bl_geo %>%
st_drop_geometry() %>% # Remove the shape geometry using different method (rather than st_set_geometry(NULL))
select(-GEOGRAPHY, -GEOID, -NAME, -ORIG_NAME, -hh_avgsz_diff, -hh_avgsz_dp, -hh_avgsz_sf, -hu_occpan_occ_perc_diff, -hu_occpan_occ_perc_dp, -hu_occpan_occ_perc_sf, -hu_occpan_vac_perc_diff, -hu_occpan_vac_perc_dp, -hu_occpan_vac_perc_sf, -pop_sex_sexratio_diff, -pop_sex_sexratio_dp, -pop_sex_sexratio_sf) %>% # Remove block geographic identifiers and any calculated fields resulting from division
mutate(GEOGRAPHY = UGB_GEOG) %>%
select(-UGB_GEOG, -COUNTY_GEOID, -COUNTY_NAME) %>%
select(GEOGRAPHY, instCode, InstName, everything()) %>%
group_by(GEOGRAPHY, instCode, InstName) %>% # GroupBy key columns
summarise_all(~ sum(.)) %>% # Sum all block variable values per UGB using lamda
ungroup() # Ungroup to remove grouping functionality
# Calculate additional variables
dpsf_wide_UGB_nonUGBcounty <- dpsf_wide_UGB_nonUGBcounty %>%
mutate(hh_avgsz_dp = pop_hh_total_dp / hu_occpan_occ_tot_dp) %>% # Calculate average household size
mutate(hu_occpan_occ_perc_dp = hu_occpan_occ_tot_dp / hu_occpan_total_dp) %>% # Calculate occupancy percentage
mutate(hu_occpan_vac_perc_dp = hu_occpan_vac_tot_dp / hu_occpan_total_dp) %>% # Calculate vacancy percentage
mutate(pop_sex_sexratio_dp = (pop_sex_m_tot_dp / pop_sex_f_tot_dp) * 100) %>% # Calculate sex ratio
mutate(hh_avgsz_sf = pop_hh_total_sf / hu_occpan_occ_tot_sf) %>% # Calculate average household size
mutate(hu_occpan_occ_perc_sf = hu_occpan_occ_tot_sf / hu_occpan_total_sf) %>% # Calculate occupancy percentage
mutate(hu_occpan_vac_perc_sf = hu_occpan_vac_tot_sf / hu_occpan_total_sf) %>% # Calculate vacancy percentage
mutate(pop_sex_sexratio_sf = (pop_sex_m_tot_sf / pop_sex_f_tot_sf) * 100) %>% # Calculate sex ratio
mutate(hh_avgsz_diff = hh_avgsz_dp - hh_avgsz_sf) %>% # Average household size difference
mutate(hu_occpan_occ_perc_diff = hu_occpan_occ_perc_dp - hu_occpan_occ_perc_sf) %>% # Occupancy percentage difference
mutate(hu_occpan_vac_perc_diff = hu_occpan_vac_perc_dp - hu_occpan_vac_perc_sf) %>% # Vacancy percentage difference
mutate(pop_sex_sexratio_diff = pop_sex_sexratio_dp - pop_sex_sexratio_sf) # Sex ratio difference
# Join aggregated UGB data to UGB shapes
dpsf_wide_UGB_geo <- shp_UGB2018 %>%
inner_join(filter(dpsf_wide_UGB_nonUGBcounty, GEOGRAPHY == "UGB"), by = c("InstName" = "InstName", "instCode" = "instCode")) %>% # Inner join based on InstName
mutate(NAME = InstName) %>%
mutate(GEOID = instCode) %>%
mutate(ORIG_NAME = NAME) %>%
select(-InstName, -instCode) %>%
select(GEOGRAPHY, GEOID, NAME, ORIG_NAME, everything())
# Field map of UGBs
# Geographic identifiers: 4
# Variables: 360 (120 * 3 [dp, sf, diff])
# Geometry: 1
# Total: 365
# Obs: 217
```
The inner join above drops the non-UGB county observations. Create spatial objects that represent the non-UGB areas and include corresponding attributes. Dissolve the non-UGB county block shapes. [Reference Dissolve polygons in R](https://philmikejones.me/tutorials/2015-09-03-dissolve-polygons-in-r/ "Reference Dissolve polygons in R")
```{r}
# Create data frame of unincorporated county observations
# shp_nonUGBcounty <- dpsf_wide_bl_geo %>%
# filter(UGB_GEOG == "non-UGB County") # Query non-UGB county observations
# shp_nonUGBcounty$area <- st_area(shp_nonUGBcounty) # Create an area variable
# Dissolve the nonUGB county blocks by county
# shp_nonUGBcounty <- shp_nonUGBcounty %>%
# group_by(InstName) %>% # GroupBy non-UGB county name
# summarise(area = sum(area)) %>% # Summarise/dissolve the shapes
# select(-area) # Remove the area variable
```
Review the non-UGB county spatial object and save out as an RDS file.
```{r}
# ggplot(shp_nonUGBcounty) + geom_sf()
# saveRDS(shp_nonUGBcounty,"./Data/intermediates/shp_nonUGBcounty.rds")
```
Read in the non-UGB county spatial objects.
```{r class.source = 'fold-show'}
shp_nonUGBcounty <- readRDS(file = "./Data/intermediates/shp_nonUGBcounty.rds")
```
Join the non-UGB county spatial objects to the anti-join of the UGB attributes and UGB shape data frames.
```{r class.source = 'fold-show'}
# Join non-UGB county data to non-UGB county shapes
dpsf_wide_nonUGBcounty_geo <- shp_nonUGBcounty %>%
inner_join(filter(dpsf_wide_UGB_nonUGBcounty, GEOGRAPHY == "non-UGB County"), by = c("InstName" = "InstName")) %>%
mutate(NAME = InstName) %>%
mutate(GEOID = instCode) %>%
mutate(ORIG_NAME = NAME) %>%
select(-InstName, -instCode) %>%
select(GEOGRAPHY, GEOID, NAME, ORIG_NAME, everything())
# Field map of non-UGB counties
# Geographic identifiers: 4
# Variables: 360 (120 * 3 [dp, sf, diff])
# Geometry: 1
# Total: 365
# Obs: 36
```
Union the UGB and non-UGB observations with the `dpsf_wide_geog_geo` data frame.
```{r class.source = 'fold-show'}
dpsf_wide_geog_geo <- rbind(dpsf_wide_geog_geo, dpsf_wide_UGB_geo, dpsf_wide_nonUGBcounty_geo)
# Field map
# Geographic identifiers: 4
# Variables: 360 (120 * 3 [dp, sf, diff])
# Geometry: 1
# Total: 365
# Obs: 197,382 (sum of all geographic entities)
```
Save out the UGB/non-UGB county and complete geographies data frames as RDS files.
```{r}
# Save out as RDS file
# saveRDS(dpsf_wide_UGB_nonUGBcounty,"./Data/intermediates/dpsf_wide_UGB_nonUGBcounty.rds")
# saveRDS(dpsf_wide_geog_geo,"./Data/intermediates/dpsf_wide_geog_geo.rds")
```
Remove the geographically subsetted data frames from the environment.
```{r}
# Remove the geographically subsetted data frames
remove(dpsf_wide_bl_geo)
remove(dpsf_wide_UGB_geo)
remove(dpsf_wide_nonUGBcounty_geo)