-
Notifications
You must be signed in to change notification settings - Fork 0
/
ADD2024_day2_pm.R
129 lines (94 loc) · 4.58 KB
/
ADD2024_day2_pm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
library(tidyverse)
library(readxl)
# Row filtering with filter() ----
# filter() works on ROWS (observations), not on columns.

# Load the second sheet of the multi-sheet Excel workbook.
dat2 <- read_xlsx("livestock_data.xlsx", sheet = 2)

# Keep the rows for households with more than 10 years of
# experience (stored in the hh_experience variable).
dat2 |>
  filter(hh_experience > 10) # 303 obs.

# Logical conditions can be combined within one filter():
#   "&" is the logical AND,
#   "|" is the logical OR.
# Households with more than 10 years of experience AND
# anydairy equal to 1:
dat2 |>
  filter(hh_experience > 10 & anydairy == 1) # 231 observations
# Working with columns (variables) ----

# select() picks columns, here by name:
dat2 |>
  select(education, hh_experience)

# Columns can also be picked by their numerical position:
# here the first three columns plus the sixth one.
dat2 |>
  select(1:3, 6)

# The where() helper selects columns according to a predicate,
# i.e. a function that answers TRUE or FALSE for each column.
# Two examples of what such predicates return:
is.numeric("hello") # FALSE
is.character("hello") # TRUE

# So the columns of type character are obtained with:
dat2 |>
  select(where(is.character))

# Selections can be negated:
dat2 |>
  select(!where(is.character)) # all columns that are NOT of type character
dat2 |>
  select(-qno) # every column except qno

# Helpers that match on the column names:
dat2 |>
  select(starts_with("off")) # columns whose name starts with "off"
dat2 |>
  select(contains("farm")) # columns with "farm" somewhere in their name
dat2 |>
  select(!contains("_")) # all columns except those whose name contains "_"
# mutate() creates new variables, which may or may not be
# based on the existing content of the dataset.

# A constant column; the resulting table is stored in `toto`.
toto <- dat2 |>
  mutate(dummy5 = 5)

# A column holding the sum of two other columns:
dat2 |>
  mutate(sum_F_and_D = fem_totadult + depend_ratio) |>
  View()

# The .after argument controls where the new column appears:
dat2 |>
  mutate(
    sum_F_and_D = fem_totadult + depend_ratio,
    .after = depend_ratio
  ) |>
  View()
# About the sample() function ----
#
# sample() draws values from a vector, with or without replacement.

# Fixing the seed is necessary if you want reproducible draws.
set.seed(1982)

# Pick 20 integers between 1 and 1000, without replacement.
sample(1:1000, size = 20)

# Without replacement there are not enough elements to pick from, so this
# call fails. It is wrapped in try() so that the error message is still
# shown but the script keeps running when it is sourced as a whole.
try(sample(1:10, size = 20))

# With replacement the same request succeeds; the 20 draws are kept in
# `series` for the ifelse() examples.
series <- sample(1:10, size = 20, replace = TRUE)
# About the ifelse() function ----
# ifelse() takes three arguments:
# - a logical test (returning TRUE or FALSE)
# - an expression that will be returned when the test returns TRUE
# - an expression that will be returned when the test returns FALSE
ifelse(5 > 3, "yes", "no")
ifelse(5 < 3, "yes", "no")

# ifelse() can be used on vectors: the test is applied element by element.
ifelse(series > 3, "yes", "no")

# This could not be done with the built-in if ... else construct:
if (5 > 3) "yes" else "no" # with just one value, it's ok

# Not ok: the condition has length 20. Since R 4.2.0 this is an error
# (older versions warn and only use the first element). try() displays
# the message without stopping the script when it is sourced.
try(if (series > 3) "yes" else "no")

# For a vector we actually have to write:
ifelse(series > 3, "yes", "no")
# Using ifelse() inside a mutate() call.
# EXERCISE: immediately to the right of hh_experience, write a new
# variable called hh_experience_threshold that contains the value 1
# if the experience is >= 10 years, and 0 otherwise.
# The result overwrites dat2.
dat2 <- dat2 |>
  mutate(
    hh_experience_threshold = ifelse(hh_experience >= 10, 1, 0),
    .after = hh_experience
  )
# The summarize() function ----
# A single summary value computed over the whole table:
mtcars |>
  as_tibble() |>
  summarize(avg = mean(mpg))
mean(mtcars$mpg) # checking the value

# We can compute multiple summaries at once.
# Note: always spell out TRUE/FALSE; T and F are ordinary variables
# that can be reassigned.
mtcars |>
  as_tibble() |>
  summarize(
    avg = mean(mpg),
    max_cyl = max(cyl),
    stdev_displacement = sd(disp, na.rm = TRUE)
  )

# Grouping data to summarize meaningfully:
mtcars |>
  as_tibble() |>
  group_by(gear)
# Grouping in itself doesn't do much to the data,
# it only prepares future summaries to work in a groupwise
# fashion. Grouping DOES NOT change the order of rows.
mtcars |>
  as_tibble() |>
  group_by(gear) |>
  summarize(
    n = n(),
    avg = mean(mpg),
    max_cyl = max(cyl),
    stdev_displacement = sd(disp, na.rm = TRUE)
  )
# Grouping and summarizing in dat2 ----
dat2 |>
  group_by(depend_ratio) # 50 groups!

# Counting the distinct values of the grouping variable:
dat2 |>
  pull(depend_ratio) |>
  unique() |>
  length()

# It is not usually appropriate to group on a numerical value,
# especially a continuous one. Here we group on sex instead.
# na.rm is a logical flag, so it is given as TRUE rather than 1.
dat2 |>
  group_by(sex) |>
  summarize(mean_age = mean(age, na.rm = TRUE))
# Using mutate() to create a variable with imputation of NA values ----
# off_farm_act contains some NA values:
dat2 |>
  pull(off_farm_act) |>
  is.na() |>
  sum() # 43

# We want to replace these values with the median of all
# other (non-missing) values in the column.
median(dat2$off_farm_act, na.rm = TRUE)

# The imputed copy is placed right after the original column; rows where
# off_farm_act is NA receive the median, all other rows keep their value.
dat2 |>
  mutate(
    off_farm_act_imputed = ifelse(
      is.na(off_farm_act),
      median(off_farm_act, na.rm = TRUE),
      off_farm_act
    ),
    .after = off_farm_act
  ) # |> filter(is.na(off_farm_act)) |> View()