-
Notifications
You must be signed in to change notification settings - Fork 14
/
tidying_automation_homework_key_jake.R
148 lines (112 loc) · 5.62 KB
/
tidying_automation_homework_key_jake.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#PSYC 259 Homework 3 - Data Tidying and Automation
#This assignment should be completed in RStudioCloud
#For full credit, provide answers for at least 6/9 questions
#List names of students collaborating with:
### SETUP: RUN THIS BEFORE STARTING ----------
# Install tidyverse only when it is missing, so re-running the script does not
# trigger a fresh download and install every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse") #If not installed
}
#Load packages
library(tidyverse)
# One CSV per film, read straight from GitHub
paths <- c(
  "https://raw.githubusercontent.com/jennybc/lotr-tidy/master/data/The_Fellowship_Of_The_Ring.csv",
  "https://raw.githubusercontent.com/jennybc/lotr-tidy/master/data/The_Two_Towers.csv",
  "https://raw.githubusercontent.com/jennybc/lotr-tidy/master/data/The_Return_Of_The_King.csv"
)
#Read data
#Each dataset has the words spoken by male/female characters in the LOTR trilogy by race (elf, hobbit, or human)
ds1 <- read_csv(paths[[1]])
ds2 <- read_csv(paths[[2]])
ds3 <- read_csv(paths[[3]])
# Stack the three per-film tibbles into one reference dataset
ds_combined <- bind_rows(list(ds1, ds2, ds3))
### Question 1 ----------
#For this assignment, you created a fork from the Github repo and cloned your own copy
#As you work on the assignment, make commits and push the changes to your own repository.
#Make your repository public and paste the link here:
#ANSWER
#YOUR GITHUB LINK:
### Question 2 ----------
#Use a for loop with paths to read the data in to a new tibble "ds_loop" so that the data are combined into a single dataset
#(Yes, Vroom does this automatically but practice doing it with a loop)
#If you did this correctly, it should look the same as ds_combined created above
#ANSWER
# Read every file into a preallocated list, then bind once at the end.
# This avoids downloading paths[1] an extra time just to obtain the column
# headers, and avoids re-copying a growing tibble on every iteration.
loop_parts <- vector("list", length(paths))
for (idx in seq_along(paths)) {
  loop_parts[[idx]] <- read_csv(paths[idx])
}
ds_loops <- bind_rows(loop_parts)
# The question asks for "ds_loop"; ds_loops is kept because later code uses it.
ds_loop <- ds_loops
## JAKE
## predefine way; more of a pain but with larger problems will be faster
x <- names(ds_loops) # store column names
dsAmt <- length(paths) # how many dataframes are we importing
dsLen <- nrow(ds1) # number of rows per file (taken from the first file)
# define dataframe: number of columns from the initial dataframe, and number of
# rows of the initial dataframe times the number of dataframes being imported
ds_dat <- as.data.frame(matrix(ncol = ncol(ds_loops), nrow = dsLen * dsAmt))
# seq_len() keeps the loop from running when paths is empty; k stays 0-based
for (k in seq_len(dsAmt) - 1L) {
  i <- 1 + dsLen * k # first row of this file's slot (stride is dsLen, not a fixed 3)
  e <- i + (dsLen - 1) # last row of this file's slot
  chunk <- read.csv(paths[k + 1])
  # positional insertion only works if every file has the same row count
  stopifnot(nrow(chunk) == dsLen)
  ds_dat[i:e, ] <- chunk # insert by position rather than bind rows
}
colnames(ds_dat) <- x # assign names
### Question 3 ----------
#Use map with paths to read in the data to a single tibble called ds_map
#If you did this correctly, it should look the same as ds_combined created above
#ANSWER
# Read each file with map(), then row-bind the resulting list of tibbles
ds_map <- bind_rows(map(paths, read_csv))
### Question 4 ----------
#The data are in a wider-than-ideal format.
#Use pivot_longer to reshape the data so that sex is a column with values male/female and words is a column
#Use ds_combined or one of the ones you created in Question 2 or 3, and save the output to ds_longer
#ANSWER
# Collapse the Male/Female columns into a Sex label column and a Words value column
ds_longer <- ds_combined %>%
  pivot_longer(cols = Male:Female, names_to = "Sex", values_to = "Words")
### Question 5 ----------
#It's helpful to know how many words were spoken, but each book was a different length
#The tibble below contains the total number of words in each book (make sure to run those lines so that it appears in your environment)
#Merge it into ds_longer and then create a new column that expresses the words spoken as a percentage of the total
total_words <- tibble(
  Film = c("The Fellowship Of The Ring", "The Two Towers", "The Return Of The King"),
  Total = c(177277, 143436, 134462)
)
#ANSWER
# Attach each film's total word count, then express Words as a percentage of it
ds_longer <- left_join(ds_longer, total_words, by = "Film")
ds_longer <- mutate(ds_longer, Percent = Words / Total * 100)
### Question 6 ----------
#The function below creates a graph to compare the words spoken by race/sex for a single film
#The input for the function is a tibble that contains only a single film
#Write a for loop that iterates through the film names to apply the function to a subset of ds_longer (each film)
#Run all the lines of code below to define the function (it should show in your environment after running)
# Draw a dodged bar chart of words spoken, by race and sex, for one film.
#   df: a data frame with the columns referenced below (Race, Words, Sex, Film);
#       it is expected to contain the rows of a single film.
# The plot is printed as a side effect; the function's value is whatever
# print() returns for the plot object.
words_graph <- function(df) {
  # df$Film repeats the film name on every row, but a title should be a single
  # string, so reduce it to the distinct name(s) before passing it to ggtitle().
  film_title <- paste(unique(df$Film), collapse = ", ")
  p <- ggplot(df, aes(x = Race, y = Words, fill = Sex)) +
    geom_col(position = "dodge") + # bar heights come straight from Words
    ggtitle(film_title) + theme_minimal()
  print(p)
}
#ANSWER
films <- unique(ds_longer$Film) #Or write them all out
# Plot one film at a time: subset to that film's rows, then hand them to words_graph
for (film in films) {
  film_rows <- filter(ds_longer, Film == film)
  words_graph(film_rows)
}
## JAKE, base R analogue
# Same loop, using logical row indexing instead of dplyr::filter
for (film in films) {
  is_this_film <- ds_longer$Film == film
  words_graph(ds_longer[is_this_film, ])
}
### Question 7 ----------
#Apply the words_graph function again, but this time
#use split and map to apply the function to each film separately
#ANSWER
# words_graph is called for its side effect (drawing the plot), so use walk()
# rather than map(): map() would return a visible list of the plots, which is
# then auto-printed at top level and draws every plot a second time.
ds_longer %>% split(.$Film) %>% walk(words_graph)
### Question 8 ----------
#The PI wants a .csv file for each film with a row for male and a row for female
#and separate columns for the words spoken by each race and the percentage of words spoken by each race
#First, get the data formatted in the correct way
#From ds_longer, create a new tibble "ds_wider" that has columns for words for each race and percentage for each race
#ANSWER
# Spread Race across columns, producing one Words_* and one Percent_* column per race
ds_wider <- pivot_wider(
  ds_longer,
  names_from = Race,
  values_from = c(Words, Percent)
)
### Question 9 ---------
#Using your new "ds_wider" tibble, write the three data files using either a for loop or map
#The files should be written to "data_cleaned" and should be named by film title
#ANSWER - Map
films <- unique(ds_wider$Film)
# write_csv() does not create missing folders, so make sure the target exists
# (showWarnings = FALSE keeps this quiet when the folder is already there)
dir.create("data_cleaned", showWarnings = FALSE)
# walk() instead of map(): the files are written for their side effect, and
# map() would return (and print) a list of the tibbles that were written
walk(films, ~ write_csv(filter(ds_wider, Film == .x), paste0("data_cleaned/", .x, ".csv")))
#base r sorta
# Base R analogue of the map answer above: lapply() over the film names and
# write each film's rows with write.csv(). (The earlier apply() call could not
# run: it had no MARGIN argument and no data object to work on.)
# NOTE(review): assumed intent is "same task without purrr/readr" - confirm.
dir.create("data_cleaned", showWarnings = FALSE)
invisible(lapply(films, function(f) {
  write.csv(ds_wider[ds_wider$Film == f, ],
            file.path("data_cleaned", paste0(f, ".csv")),
            row.names = FALSE)
}))
#ANSWER - for loop
films <- unique(ds_wider$Film)
# write_csv() does not create missing folders, so make sure the target exists
dir.create("data_cleaned", showWarnings = FALSE)
# One file per film, named after the film title
for (film in films) {
  ds_wider %>%
    filter(Film == film) %>%
    write_csv(paste0("data_cleaned/", film, ".csv"))
}