-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path3 - Exploratory Analysis.R
110 lines (92 loc) · 3.21 KB
/
3 - Exploratory Analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#Libraries for Data Wrangling & Visualisation
library(tidyverse)
library(magrittr)
library(dplyr)
library(ggplot2)
library(knitr)
library(GGally)
library(httr)
library(jsonlite)
library(tidyr)
library(zoo)
library(RCurl)
#Other miscellaneous libraries
library(gt)
library(skimr)
library(ggdist)
library(showtext)
# Restore the workspace left behind by the data-wrangling step.
# NOTE: the file is read with load(), i.e. it is expected to be a
# save()/save.image() workspace image rather than a single readRDS() object,
# even though its extension is .rds. The same path is rewritten with
# save.image() at the end of this script.
load("D:/R Studio/Downloads/Data/For Project/Spotify Database/API Data/hiphop_main.rds")
# Exploratory analysis ----
# Column-wise summary of the song data frame.
songdf %>%
  summary()

# Pairwise plots via ggpairs() to look for correlations. With this many
# variables the resulting grid is hard to read, so a correlation matrix is
# built in the next step instead.
songdf %>%
  dplyr::select(-c("duration", "track", "release", "album")) %>%
  ggpairs()
# Correlation matrix ----
# Packages loaded for this step.
library(purrr)
library(Hmisc)

# Drop the columns excluded from the correlation analysis, coerce what is left
# to a matrix and hand it to Hmisc::rcorr().
songdf_cor <- songdf %>%
  dplyr::select(-c("track", "release", "releaseyr", "album")) %>%
  as.matrix() %>%
  Hmisc::rcorr()
# rcorr() returns more than just the correlation coefficients, so only the
# coefficient matrix (the `r` element) is extracted. It is converted to a data
# frame and rounded to 2 decimal places.
songdf_cor_r <- round(data.frame(songdf_cor$r), digits = 2)

# Inspect the structure of the result and the correlations with popularity.
str(songdf_cor_r)
songdf_cor_r$popularity
# Correlation matrix plot (ggcorrplot) ----
# The correlations in this data set are very weak, which makes the plot hard
# to interpret; it is therefore not used in the report.
library(ggcorrplot)
songdf_cor_r %>%
  ggcorrplot(
    type = "lower",
    hc.order = TRUE,
    lab = TRUE,
    colors = c("black", "white", "#013220"),
    outline.col = "black",
    ggtheme = ggplot2::theme_gray
  )
# Visual exploration of individual relationships ----
# Song age vs. popularity: scatter plot with a fitted linear ("lm") trend line.
ggplot(songdf, aes(age, popularity)) +
  geom_point() +
  stat_smooth(method = "lm", colour = "#1DB954") +
  labs(
    title = "Relationship Between Ages of Songs and Popularity",
    x = "Age of Songs (In Years)",
    y = "Song Popularity"
  ) +
  # Centre the title above the panel.
  theme(plot.title = element_text(hjust = 0.5))
# Unused alternative palette: scale_color_brewer(palette = "Greens")
# Explicit lyrics vs. popularity ----
# Box plot of popularity for each value of `explicit`. The stat_summary() text
# layer prints the values returned by quantile() (by default the minimum, the
# three quartiles and the maximum) beside each box, nudged to the right so the
# labels do not sit on top of the box.
ggplot(data = songdf, aes(x = explicit, y = popularity)) +
  geom_boxplot(
    fill = "#1DB954",
    colour = "black",
    width = 0.6
  ) +
  stat_summary(
    geom = "text",
    fun = quantile,
    # after_stat(y) replaces the `..y..` notation, which is deprecated as of
    # ggplot2 3.4.0; it refers to the value computed by the summary function.
    aes(label = sprintf("%1.1f", after_stat(y))),
    position = position_nudge(x = 0.40),
    size = 3.5
  ) +
  xlab("Explicit Lyrics") +
  ylab("Song Popularity") +
  ggtitle("Relationship Between Explicit Lyrics and Popularity") +
  # Centre the title above the panel.
  theme(plot.title = element_text(hjust = 0.5))
# Speechiness vs. popularity: scatter plot with a fitted linear ("lm") trend
# line drawn in red.
ggplot(songdf, aes(speechiness, popularity)) +
  geom_point() +
  stat_smooth(method = "lm", colour = "red") +
  labs(
    title = "Relationship Between Speechiness and Popularity",
    x = "Speechiness",
    y = "Song Popularity"
  ) +
  # Centre the title above the panel.
  theme(plot.title = element_text(hjust = 0.5))
# Persist the whole workspace to the same path it was loaded from at the top
# of this script. save.image() writes a workspace image, so the file must be
# read back with load() even though its extension is .rds.
save.image("D:/R Studio/Downloads/Data/For Project/Spotify Database/API Data/hiphop_main.rds")
# Next: script 4 (Linear and Hierarchical Linear Modelling)