forked from DataS-DHSC/coffee-and-coding
-
Notifications
You must be signed in to change notification settings - Fork 0
/
2021-03-04 Organising survey data for beginners
81 lines (62 loc) · 2.87 KB
/
2021-03-04 Organising survey data for beginners
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
##Reshaping data###
####Coffee and Coding######
#load libraries
library(tidyverse)
library(purrr)
library(openxlsx)
library(magrittr)
#read in tidy data
tidy_data <- read.csv('./output/tidy_data.csv', stringsAsFactors = FALSE)
#mutate the publication date column in date format
data_tidy<- tidy_data %>%
dplyr::mutate(pub_date = as.Date(pub_date, format = '%d %B %Y'))
#select a question you want to look at
keyquestions <- c('support or oppose')
#filter data by that question
key_d <- data_tidy %>%
dplyr::filter(stringr::str_detect(
question, paste0(keyquestions, collapse = "|"))) #this paste0 is in place if you have more than one pattern
#checking that I only have that question
print(unique(key_d$question))
#check the answer types for that question
answer_types <- key_d %>%
group_by(answer) %>% summarise(n=n()) %>%
ungroup() %>% arrange(-n)
#you can export data splitting by category or answer types
category_types <- key_d %>%
group_by(category) %>% summarise(n=n()) %>%
ungroup() %>% arrange(-n)
##I am going to export my data with the categories in each tab
#so 1 tab in excel = 1 category of surveyed sample
#each tab will have different answer types for that category
# Create data frames for each category of sample
all <- dplyr::filter(key_d, grepl('All person', category, ignore.case = TRUE)) %>%
na.omit(value)
age <- dplyr::filter(key_d, grepl('^(\\d\\d)', category) &
!grepl('Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)', category)) %>%
na.omit(value)
print(unique(age$category))
gender <- dplyr::filter(key_d, grepl('Men|Women', category,ignore.case = TRUE)) %>%
na.omit(value)
region <- dplyr::filter(key_d,
grepl('^(Eng|Scot|Wal|South|East|West|North|York|Lond)',
category)) %>% na.omit(value)
#making a list of all data frames
data_cat <- list(all = all,
age = age,
gender = gender,
region = region)
#let's view what 1 of my excel tabs will look like
view(data_cat$all)
#sort through my list of data frames
final <- purrr::map(data_cat, ~ arrange(.x, pub_date, question, category))
final2 <- purrr::map(final, ~ select(.x, pub_date, question, category, answer, value, file, sheet, survey_date))
view(final$all)
#export to excel with the sheet name = each data frame name in list
for (i in 1:length(final2)){
openxlsx::write.xlsx(final2,
file = paste0(keyquestions,"_",
" ", Sys.Date(), ".xlsx"),
sheetNames = names(final2[i])
)
}