Joshua Cook 10/15/2020
# Document-wide chunk options: echo each chunk's source code and prefix
# printed output lines with "#>".
knitr::opts_chunk$set(comment = "#>", echo = TRUE)
Data dictionary:
# Read the raw cars table into `big_epa_cars`.
# NOTE(review): the path given to read_csv() is an empty string and the pipe
# at the end of this line has no right-hand side -- the data source and the
# step that followed it appear to be missing; restore both before running.
big_epa_cars <- readr::read_csv("") %>%
# Build the working table `big_cars`: keep a subset of columns, drop duplicate
# rows, and give the columns shorter names.
big_cars <- big_epa_cars %>%
# NOTE(review): the filter() condition is missing and its parenthesis is never
# closed; the intended row filter cannot be determined from this file.
filter( %>%
select(id, make, model, year, eng_id, barrels08, city08, highway08,
co2, cylinders, displ, drive, fuel_cost08) %>%
distinct() %>%
# New names are assigned positionally, in the same order as the select() above.
set_names(c("id", "mfr", "model", "year", "engine_id", "barrels",
"city_mpg", "highway_mpg", "co2", "cylinders", "displ",
"drive", "fuel_cost"))
#> # A tibble: 41,804 x 13
#> id mfr model year engine_id barrels city_mpg highway_mpg co2
#> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 Alfa… Spid… 1985 9011 15.7 19 25 -1
#> 2 10 Ferr… Test… 1985 22020 30.0 9 14 -1
#> 3 100 Dodge Char… 1985 2100 12.2 23 33 -1
#> 4 1000 Dodge B150… 1985 2850 30.0 10 12 -1
#> 5 10000 Suba… Lega… 1993 66031 17.3 17 23 -1
#> 6 10001 Suba… Loya… 1993 66020 15.0 21 24 -1
#> 7 10002 Suba… Loya… 1993 66020 13.2 22 29 -1
#> 8 10003 Toyo… Coro… 1993 57005 13.7 23 26 -1
#> 9 10004 Toyo… Coro… 1993 57005 12.7 23 31 -1
#> 10 10005 Toyo… Coro… 1993 57006 13.2 23 30 -1
#> # … with 41,794 more rows, and 4 more variables: cylinders <dbl>, displ <dbl>,
#> # drive <chr>, fuel_cost <dbl>
# Scatter-plot matrix of the columns from `barrels` through `fuel_cost`,
# drawn on a random sample of 300 rows.
big_cars %>%
  sample_n(size = 300) %>%
  select(barrels:fuel_cost) %>%
  GGally::ggscatmat(data = ., alpha = 0.6)
The co2
column will be removed because it is missing a lot of data (missing values are coded as -1).
# Show the manufacturer and model of 10 randomly chosen rows whose `co2`
# value is -1.
big_cars %>%
  filter(co2 == -1) %>%
  sample_n(size = 10) %>%
  select(mfr, model)
#> # A tibble: 10 x 2
#> mfr model
#> <chr> <chr>
#> 1 Acura SLX
#> 2 GMC Rally G35 2WD
#> 3 Ford Focus
#> 4 Nissan Xterra 4WD
#> 5 Mercury Topaz
#> 6 Suzuki Vitara 2Door
#> 7 Porsche Targa
#> 8 Pontiac 6000 Wagon
#> 9 Chevrolet Colorado 2WD
#> 10 Chevrolet G10/20 Van 2WD
The fuel_cost
and barrels
columns are highly correlated, and the city_mpg
and highway_mpg
columns are also highly correlated. Therefore, we can
remove one column of each pair.
# Remove `fuel_cost`, `city_mpg` and `co2` from the working table and store
# the result back in `big_cars`.
big_cars <- big_cars %>%
  select(-c(fuel_cost, city_mpg, co2))
# Tabulate the number of rows for each value of `drive` (NA gets its own row).
# The pipe previously had no right-hand side; count(drive) yields the
# `drive` / `n` table printed below.
big_cars %>%
  count(drive)
#> # A tibble: 8 x 2
#> drive n
#> <chr> <int>
#> 1 2-Wheel Drive 507
#> 2 4-Wheel Drive 1578
#> 3 4-Wheel or All-Wheel Drive 6648
#> 4 All-Wheel Drive 3222
#> 5 Front-Wheel Drive 14400
#> 6 Part-time 4-Wheel Drive 270
#> 7 Rear-Wheel Drive 13990
#> 8 <NA> 1189
# Simplify `drive` to two levels, "2-Wheel Drive" and "4-Wheel Drive".
# Rows with a missing `drive` are dropped first; every four-wheel / all-wheel
# variant is mapped to "4-Wheel Drive" and front- / rear-wheel drive to
# "2-Wheel Drive". Values that already use one of the two target labels fall
# through the final TRUE branch unchanged.
big_cars %<>%
  filter(!is.na(drive)) %>%
  mutate(drive = case_when(
    drive %in% c(
      "4-Wheel or All-Wheel Drive",
      "All-Wheel Drive",
      "Part-time 4-Wheel Drive"
    ) ~ "4-Wheel Drive",
    drive %in% c("Front-Wheel Drive", "Rear-Wheel Drive") ~ "2-Wheel Drive",
    TRUE ~ drive
  ))
# Re-tabulate `drive` to confirm that only the two collapsed levels remain.
# The pipe previously had no right-hand side; count(drive) yields the
# `drive` / `n` table printed below.
big_cars %>%
  count(drive)
#> # A tibble: 2 x 2
#> drive n
#> <chr> <int>
#> 1 2-Wheel Drive 28897
#> 2 4-Wheel Drive 11718
Remove outliers of highway_mpg
# Density of `highway_mpg`, restricted to rows with a value of at most 50.
big_cars %>%
  filter(highway_mpg <= 50) %>%
  ggplot(mapping = aes(x = highway_mpg)) +
  geom_density() +
  xlab("highway MPG") +
  ylab("density")
# Keep only rows with `highway_mpg` of at most 50 (rows where the comparison
# is NA are dropped by filter() as well) and store the result in `big_cars`.
big_cars <- big_cars %>%
  filter(highway_mpg <= 50)
# Ordinary least-squares fit of highway MPG on cylinder count, displacement
# and drive type.
lm_model <- lm(
  formula = highway_mpg ~ cylinders + displ + drive,
  data = big_cars
)
#> Call:
#> lm(formula = highway_mpg ~ cylinders + displ + drive, data = big_cars)
#> Residuals:
#> Min 1Q Median 3Q Max
#> -15.5808 -2.6701 -0.2647 2.3954 20.7525
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 34.84890 0.07215 483.021 < 2e-16 ***
#> cylinders -0.10131 0.02641 -3.836 0.000125 ***
#> displ -2.88676 0.03433 -84.090 < 2e-16 ***
#> drive4-Wheel Drive -1.96203 0.04432 -44.269 < 2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> Residual standard error: 3.993 on 40366 degrees of freedom
#> (9 observations deleted due to missingness)
#> Multiple R-squared: 0.5364, Adjusted R-squared: 0.5364
#> F-statistic: 1.557e+04 on 3 and 40366 DF, p-value: < 2.2e-16
# Bayesian fit of the same formula as `lm_model`, using stan_glm() with its
# default settings.
stan_model <- stan_glm(
  formula = highway_mpg ~ cylinders + displ + drive,
  data = big_cars
)
# Extract the posterior draws of the model parameters as a tibble with one
# column per parameter.
# The pipe previously ended without a right-hand side. The columns are renamed
# positionally, in the order of the model terms, to the syntactic names that
# the downstream code selects by (intercept:drive4_wheel_drive).
posteriors <- insight::get_parameters(stan_model) %>%
  as_tibble() %>%
  set_names(c("intercept", "cylinders", "displ", "drive4_wheel_drive"))
#> # A tibble: 4,000 x 4
#> intercept cylinders displ drive4_wheel_drive
#> <dbl> <dbl> <dbl> <dbl>
#> 1 34.8 -0.0788 -2.91 -1.93
#> 2 34.9 -0.103 -2.89 -2.01
#> 3 34.8 -0.101 -2.88 -2.01
#> 4 34.9 -0.116 -2.87 -1.92
#> 5 34.8 -0.0944 -2.90 -1.99
#> 6 34.9 -0.142 -2.83 -1.99
#> 7 34.8 -0.0854 -2.90 -1.95
#> 8 35.0 -0.130 -2.88 -1.88
#> 9 34.9 -0.128 -2.84 -2.00
#> 10 34.8 -0.108 -2.87 -1.92
#> # … with 3,990 more rows
# One density panel per parameter: reshape the draws to long form (`name`,
# `value`) and facet by parameter, with independent axes for each panel.
posteriors %>%
  pivot_longer(cols = intercept:drive4_wheel_drive) %>%
  ggplot(mapping = aes(x = value)) +
  geom_density() +
  facet_wrap(vars(name), scales = "free") +
  scale_y_continuous(expand = c(0, 0))
#> # Description of Posterior Distributions
#> Parameter | Median | CI | CI_low | CI_high | pd | ROPE_CI | ROPE_low | ROPE_high | ROPE_Percentage | Rhat | ESS
#> --------------------------------------------------------------------------------------------------------------------------
#> (Intercept) | 34.850 | 89 | 34.735 | 34.965 | 1 | 89 | -0.586 | 0.586 | 0 | 1.000 | 3605
#> cylinders | -0.102 | 89 | -0.144 | -0.059 | 1 | 89 | -0.586 | 0.586 | 1 | 1.001 | 2608
#> displ | -2.885 | 89 | -2.941 | -2.829 | 1 | 89 | -0.586 | 0.586 | 0 | 1.001 | 2769
#> drive4-Wheel Drive | -1.963 | 89 | -2.030 | -1.893 | 1 | 89 | -0.586 | 0.586 | 0 | 0.999 | 4619
#> # Proportion of samples inside the ROPE [-0.10, 0.10]:
#> inside ROPE
#> -----------
#> 46.53 %
#> # Proportion of samples inside the ROPE [-0.10, 0.10]:
#> inside ROPE
#> -----------
#> 0.00 %
#> pd = 100.00%