Joshua Cook 1/21/2020
knitr::opts_chunk$set(echo = TRUE, comment = "#>")
library(conflicted)
library(patchwork)
library(GGally)
library(factoextra)
library(tictoc)
library(lubridate)
library(vip)
library(magrittr)
library(tidyverse)
conflict_prefer("filter", "dplyr")
theme_set(theme_minimal())
set.seed(0)
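# Load the TidyTuesday Spotify songs data and parse the album release dates.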
spotify_songs <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv') %>%
mutate(track_album_release_date = ymd(track_album_release_date))
#> Parsed with column specification:
#> cols(
#> .default = col_double(),
#> track_id = col_character(),
#> track_name = col_character(),
#> track_artist = col_character(),
#> track_album_id = col_character(),
#> track_album_name = col_character(),
#> track_album_release_date = col_character(),
#> playlist_name = col_character(),
#> playlist_id = col_character(),
#> playlist_genre = col_character(),
#> playlist_subgenre = col_character()
#> )
#> See spec(...) for full column specifications.
#> Warning: 1886 failed to parse.
A few dates failed to parse because they contain only the year rather than a full year-month-day date. I do not plan to use this feature as an input for the model, so I will not worry about the parsing failures.
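As a quick check (not needed for anything downstream), one can count how many release dates came back as NA from ymd(); this count should line up with the parse failures reported above.
# Count the release dates that failed to parse (NA after ymd()).
spotify_songs %>%
  summarise(n_unparsed_dates = sum(is.na(track_album_release_date)))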
Some track_ids appear multiple times, most likely because the same song is included in multiple playlists. Annoyingly, over 1,600 songs appear in multiple playlists with different genre assignments.
# Removing playlist information removes all duplicates.
spotify_songs %>%
select(-c(playlist_name, playlist_id,
playlist_genre, playlist_subgenre)) %>%
distinct() %>%
count(track_id) %>%
filter(n > 1)
#> # A tibble: 0 x 2
#> # … with 2 variables: track_id <chr>, n <int>
# Removing all playlist info except for genre does not remove all duplicates.
spotify_songs %>%
select(-c(playlist_name, playlist_id, playlist_subgenre)) %>%
distinct() %>%
count(track_id) %>%
filter(n > 1)
#> # A tibble: 1,687 x 2
#> track_id n
#> <chr> <int>
#> 1 00qOE7OjRl0BpYiCiweZB2 2
#> 2 00QyLmjxaSEE8qIZQjBXBj 3
#> 3 00WIXhVVhswHuS6dlkScuw 2
#> 4 01iyINEYgPQ7ThMZuHUsqS 2
#> 5 0240T0gP9w6xEgIciBrfVF 2
#> 6 02CygBCQOIyEuhNZqHHcNx 2
#> 7 02itaCXOdC54J0ISjqqFAp 2
#> 8 02M6vucOvmRfMxTXDUwRXu 3
#> 9 04aAxqtGp5pv12UXAg4pkq 2
#> 10 04KTF78FFg8sOHC1BADqbY 2
#> # … with 1,677 more rows
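# Distribution of track popularity.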
spotify_songs %>%
ggplot(aes(x = track_popularity)) +
geom_histogram(binwidth = 5, alpha = 0.5,
color = "darkcyan", fill = "darkcyan") +
scale_y_continuous(expand = expansion(mult = c(0, 0))) +
labs(x = "track popularity",
y = "count",
title = "Distribution of song popularity")
spotify_songs %>%
sample_n(200) %>%
select(danceability, energy, speechiness, acousticness,
instrumentalness, liveness, valence) %>%
ggpairs() +
theme_minimal(base_size = 6)
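# Number of distinct tracks per genre.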
spotify_songs %>%
group_by(playlist_genre) %>%
summarise(n = n_distinct(track_id)) %>%
ungroup() %>%
ggplot(aes(x = playlist_genre, y = n)) +
geom_col(aes(fill = playlist_genre)) +
scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
theme(legend.position = "none") +
labs(x = "Genre", y = "count", title = "Number of songs per genre")
spotify_songs %>%
group_by(playlist_genre, playlist_subgenre) %>%
summarise(n = n_distinct(track_id)) %>%
ungroup() %>%
ggplot(aes(x = n, y = playlist_subgenre)) +
facet_wrap(~ playlist_genre, scales = "free") +
geom_col(aes(fill = playlist_genre)) +
scale_x_continuous(expand = expansion(mult = c(0, 0.05))) +
theme(legend.position = "none") +
labs(x = "count", y = "Subgrene", title = "Number of songs per subgenre")
spotify_songs %>%
filter(!is.na(track_album_release_date)) %>%
ggplot(aes(x = track_album_release_date, y = track_popularity)) +
geom_point(size = 0.5, alpha = 0.2, color = "dodgerblue") +
geom_density2d(color = "black", lty = 2, size = 1)
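# Track popularity by day of the year on which the album was released.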
spotify_songs %>%
filter(!is.na(track_album_release_date)) %>%
mutate(track_release_day = yday(track_album_release_date)) %>%
ggplot(aes(x = track_release_day, y = track_popularity)) +
geom_point(size = 0.5, alpha = 0.2, color = "dodgerblue") +
geom_density2d(color = "black", lty = 2, size = 1)
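# Summary statistics of the audio features.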
spotify_songs %>%
select(track_id, danceability:tempo) %>%
skimr::skim()
Name | Piped data |
---|---|
Number of rows | 32833 |
Number of columns | 12 |
Column type frequency: | |
character | 1 |
numeric | 11 |
Group variables | None |

Data summary
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
track_id | 0 | 1 | 22 | 22 | 0 | 28356 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
danceability | 0 | 1 | 0.65 | 0.15 | 0.00 | 0.56 | 0.67 | 0.76 | 0.98 | ▁▁▃▇▃ |
energy | 0 | 1 | 0.70 | 0.18 | 0.00 | 0.58 | 0.72 | 0.84 | 1.00 | ▁▁▅▇▇ |
key | 0 | 1 | 5.37 | 3.61 | 0.00 | 2.00 | 6.00 | 9.00 | 11.00 | ▇▂▅▅▆ |
loudness | 0 | 1 | -6.72 | 2.99 | -46.45 | -8.17 | -6.17 | -4.64 | 1.27 | ▁▁▁▂▇ |
mode | 0 | 1 | 0.57 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▆▁▁▁▇ |
speechiness | 0 | 1 | 0.11 | 0.10 | 0.00 | 0.04 | 0.06 | 0.13 | 0.92 | ▇▂▁▁▁ |
acousticness | 0 | 1 | 0.18 | 0.22 | 0.00 | 0.02 | 0.08 | 0.26 | 0.99 | ▇▂▁▁▁ |
instrumentalness | 0 | 1 | 0.08 | 0.22 | 0.00 | 0.00 | 0.00 | 0.00 | 0.99 | ▇▁▁▁▁ |
liveness | 0 | 1 | 0.19 | 0.15 | 0.00 | 0.09 | 0.13 | 0.25 | 1.00 | ▇▃▁▁▁ |
valence | 0 | 1 | 0.51 | 0.23 | 0.00 | 0.33 | 0.51 | 0.69 | 0.99 | ▃▇▇▇▃ |
tempo | 0 | 1 | 120.88 | 26.90 | 0.00 | 99.96 | 121.98 | 133.92 | 239.44 | ▁▂▇▂▁ |
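# PCA of the audio features (centered and scaled), one row per distinct track.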
spotify_songs_data <- spotify_songs %>%
select(track_id, danceability:tempo) %>%
distinct() %>%
as.data.frame() %>%
column_to_rownames("track_id")
song_pca <- prcomp(spotify_songs_data, scale. = TRUE, center = TRUE)
summary(song_pca)
#> Importance of components:
#> PC1 PC2 PC3 PC4 PC5 PC6 PC7
#> Standard deviation 1.4726 1.2331 1.0855 1.04529 0.9883 0.98439 0.93482
#> Proportion of Variance 0.1971 0.1382 0.1071 0.09933 0.0888 0.08809 0.07944
#> Cumulative Proportion 0.1971 0.3354 0.4425 0.54182 0.6306 0.71871 0.79815
#> PC8 PC9 PC10 PC11
#> Standard deviation 0.89893 0.79241 0.74870 0.47308
#> Proportion of Variance 0.07346 0.05708 0.05096 0.02035
#> Cumulative Proportion 0.87161 0.92870 0.97965 1.00000
fviz_eig(song_pca)
tibble(
pc = c(0, seq(1, ncol(song_pca$x))),
cum_prop = c(0, cumsum(song_pca$sdev^2 / sum(song_pca$sdev^2)))
) %>%
mutate(label = paste0(round(cum_prop * 100), "%")) %>%
ggplot(aes(x = pc, y = cum_prop)) +
geom_hline(yintercept = 0, color = "grey70") +
geom_vline(xintercept = 0, color = "grey70") +
geom_line(linetype = 2) +
geom_point(size = 1.2) +
geom_text(aes(label = label), family = "Arial",
nudge_x = 0.2, nudge_y = -0.03) +
labs(x = "principal component",
y = "cumulative proportion")
fviz_pca_var(song_pca, axes = c(1, 2))
fviz_pca_var(song_pca, axes = c(1, 3))
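# Attach the PCA coordinates to the track metadata, keeping only tracks with a single genre assignment.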
songs_in_pca_mat <- unique(song_pca$x)
songs_in_pca <- songs_in_pca_mat %>%
as.data.frame(stringsAsFactors = FALSE) %>%
rownames_to_column("track_id") %>%
as_tibble() %>%
left_join(spotify_songs %>% select(track_id:track_popularity,
playlist_genre:playlist_subgenre),
by = "track_id") %>%
group_by(track_id) %>%
filter(n() == 1) %>%
ungroup()
songs_in_pca %>%
sample_n(3000) %>%
ggplot(aes(x = PC1, y = PC2, color = playlist_genre)) +
geom_point(size = 1, alpha = 0.5) +
labs(color = "genre",
title = "PCA of songs by their music characteristics",
subtitle = "Colored by genre")
songs_in_pca %>%
sample_n(3000) %>%
ggplot(aes(x = PC1, y = PC3, color = playlist_genre)) +
geom_point(size = 1, alpha = 0.5) +
labs(color = "genre",
title = "PCA of songs by their music characteristics",
subtitle = "Colored by genre")
idx <- sample(seq(1, nrow(songs_in_pca_mat)), 5e3)
songs_tsne <- Rtsne::Rtsne(songs_in_pca_mat[idx, ])
songs_tsne$Y %>%
as.data.frame() %>%
as_tibble() %>%
set_names(c("Z1", "Z2")) %>%
add_column(track_id = rownames(songs_in_pca_mat)[idx]) %>%
left_join(spotify_songs %>% select(track_id:track_popularity,
playlist_genre:playlist_subgenre),
by = "track_id") %>%
group_by(track_id) %>%
filter(n() == 1) %>%
ungroup() %>%
ggplot(aes(x = Z1, y = Z2, color = playlist_genre)) +
geom_point(size = 1, alpha = 0.5) +
labs(color = "genre", title = "t-SNE of PCA of song information")
I would like to get some practice with the ‘tidymodels’ framework, so I will use it to prepare the data (using ‘rsample’ and ‘recipes’), instantiate and train a random forest model (using ‘parsnip’), measure the success of the model (using ‘yardstick’), and tune the model’s hyperparameters (using ‘tune’).
For practical reasons (my goal is to practice using the ‘tidymodels’ framework, not to replicate Spotify’s data science team), I will restrict the data to predicting between just three genres: rock, rap, and latin.
library(tidymodels)
#> ── Attaching packages ─────────────────────────────────────────────────── tidymodels 0.1.0 ──
#> ✓ broom 0.5.5 ✓ rsample 0.0.6
#> ✓ dials 0.0.6 ✓ tune 0.1.0
#> ✓ infer 0.5.1 ✓ workflows 0.1.1
#> ✓ parsnip 0.1.0 ✓ yardstick 0.0.6
#> ✓ recipes 0.1.10
set.seed(0)
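# Keep only rock, rap, and latin, drop tracks with conflicting genre labels, and sample 70% of the tracks within each genre.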
spotify_data <- spotify_songs %>%
filter(playlist_genre %in% c("rock", "rap", "latin")) %>%
select(track_id, playlist_genre, danceability:tempo) %>%
distinct() %>%
group_by(track_id) %>%
filter(n() == 1) %>%
ungroup() %>%
group_by(playlist_genre) %>%
sample_frac(0.7) %>%
ungroup()
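# 80/20 train/test split, stratified by genre.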
spotify_data_split <- initial_split(spotify_data,
prop = 0.8,
strata = playlist_genre)
spotify_data_train <- training(spotify_data_split)
spotify_data_test <- testing(spotify_data_split)
set.seed(0)
metric_pal <- randomcoloR::distinctColorPalette(11)
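# Average value of each audio feature by genre, rescaled to [0, 1] within each feature.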
spotify_data_train %>%
select(-track_id) %>%
pivot_longer(-playlist_genre, names_to = "metric", values_to = "value") %>%
group_by(playlist_genre, metric) %>%
summarise(avg_value = mean(value)) %>%
group_by(metric) %>%
mutate(avg_value = scales::rescale(avg_value, to = c(0, 1))) %>%
ungroup() %>%
ggplot(aes(x = avg_value, y = metric)) +
facet_wrap(~ playlist_genre, nrow = 1) +
geom_col(aes(fill = metric)) +
scale_fill_manual(values = metric_pal, guide = NULL) +
theme(
panel.grid.major.y = element_blank()
) +
labs(x = "average value",
y = NULL,
title = "Values of song metrics across genres")
spotify_recipe <- recipe(playlist_genre ~ ., data = spotify_data_train) %>%
update_role(track_id, new_role = "ID") %>%
step_corr(all_predictors())
spotify_prep <- prep(spotify_recipe)
spotify_juiced <- juice(spotify_prep)
spotify_juiced
#> # A tibble: 7,917 x 13
#> track_id danceability energy key loudness mode speechiness acousticness
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 555ix1g… 0.566 0.666 5 -5.74 0 0.163 0.0161
#> 2 6HTJZ0T… 0.805 0.858 0 -3.07 1 0.105 0.194
#> 3 7H4XjkO… 0.695 0.755 7 -6.87 0 0.0386 0.134
#> 4 62E8JJR… 0.658 0.863 7 -3.58 0 0.0352 0.287
#> 5 2RruTqd… 0.802 0.611 8 -7.52 1 0.0601 0.356
#> 6 7GsMKEG… 0.667 0.867 11 -5.56 0 0.0466 0.046
#> 7 0gT1wHT… 0.589 0.838 5 -4.07 0 0.0457 0.00807
#> 8 4mNOezZ… 0.654 0.803 10 -5.14 1 0.0544 0.254
#> 9 1xh59Kp… 0.399 0.53 10 -7.18 0 0.0532 0.479
#> 10 5tXg9mq… 0.712 0.61 6 -9.33 0 0.0376 0.000892
#> # … with 7,907 more rows, and 5 more variables: instrumentalness <dbl>,
#> # liveness <dbl>, valence <dbl>, tempo <dbl>, playlist_genre <fct>
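# Random forest specification with 500 trees; mtry and min_n will be tuned.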
spotify_tune_spec <- rand_forest(
mtry = tune(),
trees = 500,
min_n = tune()
) %>%
set_mode("classification") %>%
set_engine("ranger")
spotify_workflow <- workflow() %>%
add_recipe(spotify_recipe) %>%
add_model(spotify_tune_spec)
set.seed(0)
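# Create 10 cross-validation folds from the training data.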
spotify_training_folds <- vfold_cv(spotify_data_train)
spotify_training_folds
#> # 10-fold cross-validation
#> # A tibble: 10 x 2
#> splits id
#> <named list> <chr>
#> 1 <split [7.1K/792]> Fold01
#> 2 <split [7.1K/792]> Fold02
#> 3 <split [7.1K/792]> Fold03
#> 4 <split [7.1K/792]> Fold04
#> 5 <split [7.1K/792]> Fold05
#> 6 <split [7.1K/792]> Fold06
#> 7 <split [7.1K/792]> Fold07
#> 8 <split [7.1K/791]> Fold08
#> 9 <split [7.1K/791]> Fold09
#> 10 <split [7.1K/791]> Fold10
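# Register a parallel backend and run a first tuning pass over 20 candidate combinations of mtry and min_n.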
doParallel::registerDoParallel()
set.seed(0)
spotify_tune_1 <- tune_grid(
spotify_workflow,
resamples = spotify_training_folds,
grid = 20
)
#> i Creating pre-processing data to finalize unknown parameter: mtry
spotify_tune_1
#> # 10-fold cross-validation
#> # A tibble: 10 x 4
#> splits id .metrics .notes
#> <list> <chr> <list> <list>
#> 1 <split [7.1K/792]> Fold01 <tibble [40 × 5]> <tibble [0 × 1]>
#> 2 <split [7.1K/792]> Fold02 <tibble [40 × 5]> <tibble [0 × 1]>
#> 3 <split [7.1K/792]> Fold03 <tibble [40 × 5]> <tibble [0 × 1]>
#> 4 <split [7.1K/792]> Fold04 <tibble [40 × 5]> <tibble [0 × 1]>
#> 5 <split [7.1K/792]> Fold05 <tibble [40 × 5]> <tibble [0 × 1]>
#> 6 <split [7.1K/792]> Fold06 <tibble [40 × 5]> <tibble [0 × 1]>
#> 7 <split [7.1K/792]> Fold07 <tibble [40 × 5]> <tibble [0 × 1]>
#> 8 <split [7.1K/791]> Fold08 <tibble [40 × 5]> <tibble [0 × 1]>
#> 9 <split [7.1K/791]> Fold09 <tibble [40 × 5]> <tibble [0 × 1]>
#> 10 <split [7.1K/791]> Fold10 <tibble [40 × 5]> <tibble [0 × 1]>
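# Plot ROC AUC across the candidate hyperparameter values.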
p1 <- spotify_tune_1 %>%
collect_metrics() %>%
filter(.metric == "roc_auc") %>%
ggplot(aes(x = mtry, y = min_n)) +
geom_point(aes(color = mean, size = mean)) +
scale_color_gradient2(low = "blue", high = "tomato", midpoint = 0.9) +
scale_size_continuous(guide = NULL) +
labs(title = "ROC AUC variation over the hyperparameter space")
p2 <- spotify_tune_1 %>%
collect_metrics() %>%
filter(.metric == "roc_auc") %>%
select(mtry, min_n, mean) %>%
pivot_longer(-mean, names_to = "parameter", values_to = "value") %>%
ggplot(aes(x = value, y = mean)) +
facet_wrap(~ parameter, nrow = 1, scales = "free") +
geom_point(aes(color = parameter), size = 2) +
geom_smooth(aes(group = parameter),
method = "loess", formula = "y ~ x",
color = "grey30", lty = 2, se = FALSE) +
scale_color_brewer(type = "qual", palette = "Set2", guide = NULL)
p1 / p2
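# Second tuning pass over a regular grid focused on the promising region of mtry and min_n.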
spotify_tune_grid <- grid_regular(
mtry(range = c(2, 6)),
min_n(range = c(5, 15)),
levels = 5
)
spotify_tune_grid
#> # A tibble: 25 x 2
#> mtry min_n
#> <int> <int>
#> 1 2 5
#> 2 3 5
#> 3 4 5
#> 4 5 5
#> 5 6 5
#> 6 2 7
#> 7 3 7
#> 8 4 7
#> 9 5 7
#> 10 6 7
#> # … with 15 more rows
set.seed(0)
spotify_tune_2 <- tune_grid(
spotify_workflow,
resamples = spotify_training_folds,
grid = spotify_tune_grid
)
#> i Creating pre-processing data to finalize unknown parameter: mtry
spotify_tune_2
#> # 10-fold cross-validation
#> # A tibble: 10 x 4
#> splits id .metrics .notes
#> <list> <chr> <list> <list>
#> 1 <split [7.1K/792]> Fold01 <tibble [50 × 5]> <tibble [0 × 1]>
#> 2 <split [7.1K/792]> Fold02 <tibble [50 × 5]> <tibble [0 × 1]>
#> 3 <split [7.1K/792]> Fold03 <tibble [50 × 5]> <tibble [0 × 1]>
#> 4 <split [7.1K/792]> Fold04 <tibble [50 × 5]> <tibble [0 × 1]>
#> 5 <split [7.1K/792]> Fold05 <tibble [50 × 5]> <tibble [0 × 1]>
#> 6 <split [7.1K/792]> Fold06 <tibble [50 × 5]> <tibble [0 × 1]>
#> 7 <split [7.1K/792]> Fold07 <tibble [50 × 5]> <tibble [0 × 1]>
#> 8 <split [7.1K/791]> Fold08 <tibble [50 × 5]> <tibble [0 × 1]>
#> 9 <split [7.1K/791]> Fold09 <tibble [50 × 5]> <tibble [0 × 1]>
#> 10 <split [7.1K/791]> Fold10 <tibble [50 × 5]> <tibble [0 × 1]>
spotify_tune_2 %>%
collect_metrics() %>%
filter(.metric == "roc_auc") %>%
mutate(min_n = factor(min_n)) %>%
ggplot(aes(mtry, mean, color = min_n)) +
geom_line(alpha = 0.5, size = 1.5) +
geom_point() +
labs(y = "AUC")
best_auc <- select_best(spotify_tune_2, "roc_auc")
spotify_rf <- finalize_model(spotify_tune_spec, best_auc)
spotify_rf
#> Random Forest Model Specification (classification)
#>
#> Main Arguments:
#> mtry = 2
#> trees = 500
#> min_n = 5
#>
#> Computational engine: ranger
spotify_rf %>%
set_engine("ranger", importance = "permutation") %>%
fit(playlist_genre ~ .,
data = select(spotify_juiced, -track_id)) %>%
vip(geom = "point")
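# Fit the finalized workflow on the training data and evaluate it on the held-out test set.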
spotify_workflow_final <- workflow() %>%
add_recipe(spotify_recipe) %>%
add_model(spotify_rf)
spotify_random_forest <- last_fit(spotify_workflow_final, spotify_data_split)
spotify_random_forest %>%
collect_metrics()
#> # A tibble: 2 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 accuracy multiclass 0.763
#> 2 roc_auc hand_till 0.905
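# Per-genre ROC curves for the test-set predictions.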
spotify_random_forest %>%
collect_predictions() %>%
roc_curve(playlist_genre, .pred_latin:.pred_rock) %>%
autoplot()