# 01 data.frame.R
# Data frames -------------------------------------------------------------
# Data frames are the most common way to store and work with DATA. If you're
# familiar with Excel or SPSS (O`_`O), this should feel natural.
#
# In data frames, each column is a vector of some type, with (ideally) each
# vector representing a "variable", and each row representing some
# "observation". We can make a data frame with the function `data.frame()`:
# Build a small example data frame: one row per student, one column per
# variable. Every column is a vector with 7 values.
school_grades <- data.frame(
  names = c("Dana", "Avi", "Michal", "Asaf", "Jody", "Beth", "Moshe"),
  id = c(
    305850916, 381345273, 203912400, 229889795,
    304786643, 317171280, 326876070
  ),
  # The raw codes "F" / "M" are stored as a factor labelled "female" / "male".
  sex = factor(
    c("F", "M", "F", "M", "F", "F", "M"),
    levels = c("F", "M"),
    labels = c("female", "male")
  ),
  math.grades = c(93, 30, NA, 88, 100, 67, 79), # one grade is missing (NA)
  english.grades = c(100, 45, 90, 77, 88, 90, 66)
)

# Print the whole data frame
school_grades
# Some useful functions for exploring a data frame:
str(school_grades)      # see the data structure (type of each column)
head(school_grades, 3)  # first few rows - useful for very long data frames
tail(school_grades, 3)  # last few rows
ncol(school_grades)     # how many columns?
nrow(school_grades)     # how many rows?
View(school_grades)     # open it in R's data viewer
## extract & replace with [row, column]
school_grades[1, ]   # first row (all columns)
school_grades[, 1]   # first column (all rows)
school_grades[3, 5]  # 3rd row, 5th column

# What will this do?
school_grades[c(1, 2, 3, 1, 1, 1), ]

# many ways to do the same thing (the 4th value of `english.grades`)...
school_grades[4, 5]
# Inside `[ ]` a column name must be a quoted string: a bare name is looked up
# as a variable and fails with "object 'english.grades' not found".
school_grades[4, "english.grades"]
school_grades$english.grades[4]
# school_grades[["english.grades"]][4]
# school_grades[, 5][4]
# school_grades[, "english.grades"][4]
# Change existing values and add new variables

# Overwrite column 2 with NA in rows 2, 3 and 6
school_grades[c(2, 3, 6), 2] <- NA

# New columns are created by assigning to a name that does not exist yet
school_grades$pass.english <- school_grades$english.grades >= 56
school_grades$english.grades_bonus <- school_grades$english.grades + 10
school_grades$math.grades_z <- scale(school_grades$math.grades)

school_grades

# Summaries of a single column
mean(school_grades$english.grades)
sd(school_grades$english.grades)

# What do these do?
school_grades[
  school_grades$sex == "female",
  c("names", "math.grades", "math.grades_z")
]
school_grades[
  school_grades$pass.english,
  c("names", "english.grades")
]
mean(school_grades$math.grades[school_grades$pass.english])

# Drop every row that has a missing value in any column
school_grades_clean <- na.omit(school_grades)
school_grades_clean
# The ***tidyverse*** ----------------------------------------------------
# The tidyverse is an opinionated collection of R packages designed for data
# [analysis]. All packages share an underlying design philosophy, grammar, and
# data structures - "TIDY DATA"!
## What is tidy data?
# 1. Each variable forms one column.
# 2. Each observation forms one row.
# install.packages(c("tidyverse", "haven"))
library(haven) # for importing and exporting 'SPSS' file :(
library(tidyverse)
# You only need to install packages once, but you need to load them (with
# `library`) every time you open R.
# Importing data ----------------------------------------------------------
# Typically, we won't build our data frame in R - we will import data into R,
# and then manipulate it to make it compatible with our needs - modeling,
# plotting, summary tables, etc. (we will learn all of these in the upcoming
# weeks!)
# Load a data frame from a `.csv` file
data_raw <- read.csv(file = "data/deaf_numer.csv")

# For SPSS (`.sav`) files use `read_spss()` from the `haven` package.
# (This overwrites the `data_raw` that was just read from the csv file.)
data_raw <- read_spss(file = "data/deaf_numer.sav")
# see also the `readxl` pkg for excel files.

# Take a first look at what was imported
str(data_raw)
glimpse(data_raw) # better!
pairs(data_raw[seq_len(100), ]) # scatter-plot matrix of rows 1 to 100
# deaf_numer - variables in the data:
# sID - subject number
# nFingers - Number of stimulated fingers
# trial - experimental trial
# block - experimental block
# acc - accuracy: 1 correct, 0 error
# rt - reaction time
## How does R know where the file is? ------------------
# There are several methods for letting R know where your files are:
# 1. Using an RStudio project - when in a project, R searches within the project
#    folder.
# 2. Setting the folder R will look in (called the working directory) manually:
getwd() # Where is R looking now?
setwd("C:/Users/Mattan/Documents") # set it to something specific
# (You might find the `here` package to be useful too.)
# 3. Giving the full file path. `read.csv()` parses delimited text, so it has
#    to be pointed at the `.csv` file (the `.sav` file is read with
#    `read_spss()`):
data_raw <- read.csv("C:/Users/Mattan/Documents/R/PAiR/02 data wrangling/data/deaf_numer.csv")
# You can use `choose.files()` or `choose.dir()` to interactively get the full
# path to a file or folder.
# Of these options, I recommend them in the order in which they're presented.
# (Make sure you use the right / and not \ for file paths!)
# Manipulating Data -------------------------------------------------------
# `dplyr` has some very useful functions for manipulating your data.
# The first argument in ALL of these functions is a data frame (e.g., data_raw)

## select -- keeps only the listed columns
data_clean <- select(data_raw, sID, nFingers, rt)
head(data_clean)

## filter -- selects rows (here: only trials with rt below 2500)
data_clean <- filter(data_clean, rt < 2500)
nrow(data_clean)
nrow(data_raw)

## mutate -- makes a new variable, or changes an existing one
data_clean <- mutate(
  data_clean,
  sqrt_rt = sqrt(rt), # new (uses rt as it is BEFORE the next line changes it)
  rt = rt / 1000      # change RT from ms to seconds
)
head(data_clean)

## group_by -- group data by some variable
data_clean <- group_by(data_clean, nFingers)
# This doesn't actually change the data in any way, it just lets other functions
# know that they should act on the data according to the groups.
group_keys(data_clean) # see what is grouped by

# For example, mutate() on grouped data:
data_clean <- mutate(
  data_clean,
  rt_z = scale(rt)
)
# What did this do?

## ALWAYS ungroup when you're done with grouping!
data_clean <- ungroup(data_clean)
group_keys(data_clean)

View(data_clean)
# for even more functions, see the dplyr cheatsheet:
# https://www.rstudio.com/resources/cheatsheets/
# However, for very large data-sets (say, more than 1,000,000 rows) you might
# want to consider the `data.table` or `dtplyr` packages (not covered here).
# There are many packages that can help with manipulating, recoding and
# transforming data.
#
# `dplyr` itself has some useful functions that can be used inside `mutate()`
# functions (https://dplyr.tidyverse.org/reference/index.html#section-vector-functions),
# and another real powerhouse is the `sjmisc` package - see examples: http://strengejacke.de/sjmisc-cheatsheet.pdf.
## Piping with |> ("and then") -------------------------------------------
# The aim of the pipe (|>) is to make code more human readable.

# For example this:
sqrt(mean(c(1, 2, 3, 4, NA), na.rm = TRUE))
# is not very readable - it is read from in -> out...

# Even this (which does the same thing) isn't really readable - why are we
# reading from the inside out??? And not from left to right??
sqrt(
  mean(
    c(1, 2, 3, 4, NA),
    na.rm = TRUE
  )
)

# But using the pipe...
c(1, 2, 3, 4, NA) |>
  mean(na.rm = TRUE) |>
  sqrt()
# amazing!

# The pipe tells R: "When you're done with the stuff on the LEFT, pass the
# result to the next thing on the RIGHT...".
# When reading code aloud we will say "and then".

# The pipe will always* "send" the results from the left, into the FIRST unnamed
# argument of the function on the right.
# If we want to send to another argument, we can use the _ placeholder
# (which must be given to a NAMED argument):
TRUE |>
  mean(c(1, 2, 3, 4, NA), na.rm = _) |>
  sqrt()
# The pipe really shines when used with functions that share the tidyverse
# philosophy. For example, because the `dplyr` functions all take a *data frame*
# as the FIRST argument, and also all RETURN a data frame, we can PIPE `dplyr`
# functions - one step per line, read top to bottom:
data_clean_piped <- data_raw |>
  select(sID, nFingers, rt) |>
  filter(rt < 2500) |>
  mutate(
    sqrt_rt = sqrt(rt),
    rt = rt / 1000
  ) |>
  group_by(nFingers) |>
  mutate(
    rt_z = scale(rt)
  ) |>
  ungroup()

# This pipe does all the things we did above, step by step:
all.equal(data_clean, data_clean_piped)
# Sometimes you will see this type of pipe %>% - it is sort of an older version
# of the |> pipe, and in 99% of cases they both do the same thing.
# Export data -------------------------------------------------------------

# Save to a `.csv` file (load it back into an object with read.csv())
write.csv(data_clean_piped, file = "data_clean.csv")

# Save to a `.sav` file
write_sav(data_clean_piped, path = "data_clean.sav")
# BUT WHY??????????? NOOOOOOOOOO

# Save to a `.rds` file
saveRDS(data_clean_piped, file = "data_clean.Rds")
# ... and load it back into an object using readRDS():
same_data <- readRDS(file = "data_clean.Rds")
# why would you want to do this? (e.g., factors...)

# Not only data frames - here a nested list is saved the same way:
xlist <- list(
  a = 1,
  b = list(b1 = c(1, 2, 3), bx = "a")
)
saveRDS(xlist, file = "some list I made.Rds")

# We can also save multiple objects into `.rdata` files (BUT DON'T!!):
save(data_clean_piped, xlist, file = "selected_objects.rdata")
# Or the whole current environment (Don't!!!!)
save.image(file = "all_objects.rdata")
#
# (`.rdata` files are loaded back into the environment using load())
# Exercise ----------------------------------------------------------------
# Start again from the raw csv data:
data_raw <- read.csv("data/deaf_numer.csv")
# (Try to do the following with dplyr functions.)
# 1. Create a `Group` variable: (the RA forgot to do it...)
#    - For Subject (`sID`) <= 15, Group should be 1,
#    - For Subject > 15, Group should be 2.
#    TIP: use `ifelse()`
#    (see `02 flow control.R` from last lesson)
# 2. Remove the first, practice block (where `block` == 1)
# 3. Remove trials following an error
#    TIP: use `lag()`
# 4. Remove error trials (where `acc` == 0)
# 5. Remove RTs that fall beyond +/- 2 SD from *each participant's*
#    mean in *each* of the "finger" conditions.
# 6. Create the variable `vib_strength`, randomly sampled from
#    `c(soft = 0.3, strong = 1.0)`
# 7. Try doing steps 1--6 with the pipe (you can copy your solution and just
#    adjust it to work with the pipe).
# 8. Save that data to:
#    - an Rds file
#    - a csv file
# 9. Rewrite this ugly code using the pipe (|>):
#    (the calls are nested, so it reads from the inside out:
#    head -> sample -> range -> diff)
diff(range(sample(head(iris[[1]], n = 10), size = 5, replace = TRUE)))