-
Notifications
You must be signed in to change notification settings - Fork 0
/
un_data_anaylsis.R
166 lines (127 loc) · 5.74 KB
/
un_data_anaylsis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
library(tidyverse)
# Load the gapminder data set from the project's data directory.
gapminder_data <- read_csv("data/gapminder_data.csv")

# summarise() - collapse a data frame into a single summary row.
# Mean life expectancy over every row, called directly on the data frame ...
summarise(gapminder_data, averagelifeExp = mean(lifeExp))

# ... and the same summary written as a pipe, stored and then printed.
gapminder_data_summarized <- gapminder_data %>%
  summarise(averagelifeExp = mean(lifeExp))
gapminder_data_summarized

# Pick out a single value from the data: the most recent year present.
gapminder_data %>%
  summarise(recent_year = max(year))

# NOTE: a leftover interactive debugging call, rlang::last_trace(), used to
# sit here. It aborts when no error has been recorded in the session, which
# stops the script when it is run top to bottom, so it has been removed.
# filter() - keep only the rows of a data frame that satisfy a condition.
# Mean life expectancy using the 2007 rows only.
summarise(
  filter(gapminder_data, year == 2007),
  averagelifeExp = mean(lifeExp)
)

# Exercise: mean GDP per capita for the first year in the data set.
summarise(
  filter(gapminder_data, year == 1952),
  averageGDP = mean(gdpPercap)
)

# Look up the earliest year in the data ...
summarise(gapminder_data, first_year = min(year))

# ... and compute the mean GDP per capita for the 1952 rows again, this time
# with a snake_case name for the result column.
summarise(
  filter(gapminder_data, year == 1952),
  average_gdp = mean(gdpPercap)
)
# group_by() - split the rows into groups (one per distinct value of a
# column) so that summarise() returns one row per group.
# Mean life expectancy for each year.
summarise(
  group_by(gapminder_data, year),
  average_lifeExp = mean(lifeExp)
)

# Exercise: mean life expectancy for each continent.
summarise(
  group_by(gapminder_data, continent),
  average_lifeExp = mean(lifeExp)
)

# Several summaries can be computed in one call: mean and minimum life
# expectancy for each continent.
summarise(
  group_by(gapminder_data, continent),
  average_lifeExp = mean(lifeExp),
  min_lifeExp = min(lifeExp)
)
# mutate() - add a new column to a data frame or overwrite an existing one.
# Total GDP per row: population times GDP per capita (printed, not stored).
mutate(gapminder_data, gdp = pop * gdpPercap)

# Keep a copy of the data with an extra column, popInMillions, holding the
# population expressed in millions.
gapminder_data_with_mutation <- mutate(
  gapminder_data,
  popInMillions = pop / 1e6
)
# select() - choose which columns of a data frame to keep.
# Keep two columns by name.
select(gapminder_data, pop, year)

# Drop a single column with a leading minus.
select(gapminder_data, -continent)

# Exercise: a data frame with only country, continent, year and lifeExp,
# first by dropping the unwanted columns ...
select(gapminder_data, -c(pop, gdpPercap))

# ... then by naming the wanted columns explicitly.
select(gapminder_data, country, continent, year, lifeExp)
# pivot_wider() & pivot_longer() - change the shape of a data frame.
# Drop pop and gdpPercap, then spread lifeExp across one column per year.
gapminder_data %>%
  select(-c(pop, gdpPercap)) %>%
  pivot_wider(
    names_from = year,
    values_from = lifeExp
  )
# Subset gapminder_data to the year 2007 and drop the year and continent
# columns.
gapminder_data %>%
  filter(year == 2007) %>%
  select(-c(year, continent))

# Rows that are both from 2007 and from the Americas continent; conditions
# separated by commas in filter() must all hold.
gapminder_data %>%
  filter(year == 2007, continent == "Americas") %>%
  select(-c(year, continent))

# The same subset, stored for the joins further down.
gapminder_data_2007_Americas <- gapminder_data %>%
  filter(year == 2007, continent == "Americas") %>%
  select(-c(year, continent))
# Is CO2 emission related to GDP?
# Read the UN CO2 table, skipping the first two lines of the file and
# supplying the column names by hand.
co2_emssions_dirty <- read_csv(
  "data/co2-un-data.csv",
  skip = 2,
  col_names = c(
    "region", "country", "year", "series", "value", "footnotes", "source"
  )
)

# Shorten the two series labels and spread them into columns. Computed once
# and reused below instead of repeating the same pipeline for every step.
co2_emissions_wide <- co2_emssions_dirty %>%
  select(country, year, series, value) %>%
  mutate(series = recode(
    series,
    "Emissions (thousand metric tons of carbon dioxide)" = "total_emissions",
    "Emissions per capita (metric tons of carbon dioxide)" = "per_capita_emissions"
  )) %>%
  pivot_wider(names_from = series, values_from = value)

# Each country/year pair now has one row, with total_emissions and
# per_capita_emissions as columns.
co2_emissions_wide

# Number of rows available for each year.
co2_emissions_wide %>%
  count(year)

# Keep the 2005 rows only; year is then constant, so drop it.
co2_emissions <- co2_emissions_wide %>%
  filter(year == 2005) %>%
  select(-year)

# View() opens a data viewer and is only meaningful in an interactive
# session, so skip it when the script is run non-interactively.
if (interactive()) {
  View(co2_emissions)
}
# inner_join() - keep only the rows whose key appears in both tables.
# The key is named explicitly so the join cannot silently pick up any other
# column the two tables might come to share.
inner_join(gapminder_data_2007_Americas, co2_emissions, by = "country")

# anti_join() - rows of the first table with no match in the second; used
# here to list the countries that failed to join.
anti_join(gapminder_data_2007_Americas, co2_emissions, by = "country")
# Make country names common between the two data sets.
# Rebuild co2_emissions as before (shorten the series labels, one column per
# series, 2005 only), then map three UN country labels to shorter names
# (intended to match the spelling used in the gapminder data).
co2_emissions <- co2_emssions_dirty %>%
  select(country, year, series, value) %>%
  mutate(series = recode(
    series,
    "Emissions (thousand metric tons of carbon dioxide)" = "total_emissions",
    "Emissions per capita (metric tons of carbon dioxide)" = "per_capita_emissions"
  )) %>%
  pivot_wider(names_from = series, values_from = value) %>%
  filter(year == 2005) %>%
  select(-year) %>%
  mutate(country = recode(
    country,
    "Bolivia (Plurin. State of)" = "Bolivia",
    "United States of America" = "United States",
    "Venezuela (Boliv. Rep. of)" = "Venezuela"
  ))

# View() is only meaningful in an interactive session.
if (interactive()) {
  View(co2_emissions)
}

# Re-check which gapminder countries still have no match in the CO2 table.
anti_join(gapminder_data_2007_Americas, co2_emissions, by = "country")
# Relabel Puerto Rico as United States in the 2007 Americas subset. After
# this step the table holds two rows labelled "United States".
gapminder_data_2007_Americas <- gapminder_data %>%
  filter(year == 2007 & continent == "Americas") %>%
  select(-year, -continent) %>%
  mutate(country = recode(country, "Puerto Rico" = "United States"))

# View() is only meaningful in an interactive session.
if (interactive()) {
  View(gapminder_data_2007_Americas)
}

# Rebuild the subset and collapse it to one row per country. lifeExp and
# gdpPercap become population-weighted means and pop becomes the total.
# pop = sum(pop) must stay last: summarise() evaluates its arguments in
# order, so the earlier expressions still need the original per-row pop.
gapminder_data_2007_Americas <- gapminder_data %>%
  filter(year == 2007 & continent == "Americas") %>%
  select(-year, -continent) %>%
  mutate(country = recode(country, "Puerto Rico" = "United States")) %>%
  group_by(country) %>%
  summarise(
    lifeExp = sum(lifeExp * pop) / sum(pop),
    gdpPercap = sum(gdpPercap * pop) / sum(pop),
    pop = sum(pop)
  )

if (interactive()) {
  View(gapminder_data_2007_Americas)
}

# List any gapminder countries that still have no match in the CO2 table.
anti_join(gapminder_data_2007_Americas, co2_emissions, by = "country")

# Combine the two tables on country, keeping only countries present in both.
gapminder_co2 <- inner_join(
  gapminder_data_2007_Americas,
  co2_emissions,
  by = "country"
)

if (interactive()) {
  View(gapminder_co2)
}
# Is CO2 emission related to GDP?
# Scatter plot of per-capita emissions against GDP per capita, with a fitted
# linear-model trend line. The x-axis label previously read
# "GDP per capita)" with an unbalanced parenthesis; it now mirrors the
# y-axis label.
ggplot(gapminder_co2, aes(x = gdpPercap, y = per_capita_emissions)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "GDP (per capita)", y = "CO2 emitted (per capita)")

# Save the joined table alongside the input data.
write_csv(gapminder_co2, "data/gapminder_co2.csv")