Nicolas
April 26, 2016
The objective here is to explore the dataset to identify different groups of products that shares similarities and that can be treated in a similar way.
rm(list=ls(all=TRUE)) # clean up the memory of your current R session
data <- read.csv(file = 'DATA_2.01_SKU.csv')
str(data)
## 'data.frame': 100 obs. of 2 variables:
## $ ADS: int 1 3 1 2 9 2 2 8 11 6 ...
## $ CV : num 0.68 0.4 0.59 0.39 0.11 0.56 0.69 0.76 0.96 0.13 ...
summary(data)
## ADS CV
## Min. : 1.00 Min. :0.050
## 1st Qu.: 2.00 1st Qu.:0.130
## Median : 3.00 Median :0.400
## Mean : 5.61 Mean :0.396
## 3rd Qu.:10.00 3rd Qu.:0.590
## Max. :14.00 Max. :0.960
Let's plot our data to see if we can identify groups visually:
library(ggplot2)
ggplot(data=data, aes(x=CV,y=ADS)) +
geom_point() +
xlab("Coefficient of Variation") +
ylab("Average Daily Sales") +
ggtitle("SKU Example") +
geom_vline(xintercept = 0.2, color='red') +
geom_hline(yintercept = 4, color='red') +
annotate("text", x = 0.15, y = 9.7, label = "Horses", color='red') + #label the group "Horses"
annotate("text", x = 0.65, y = 9, label = "Wild Bulls", color='red') +
annotate("text", x = 0.8, y = 2, label = "Crickets", color='red')
Let's find groups using hierarchical clustering and check if we obtain similar results
testdata <- data # keeps our initial dataset safe
testdata <- scale(testdata) # normalizes the dataset (and transforms it to a matrix)
First, let's compute the distances of all the observations in our dataset
d <- dist(testdata, method = "euclidean") # distance matrix
and perform hiearchical clustering (Ward's method) and plot the dendogram
fit <- hclust(d, method="ward.D")
plot(fit) # plot dendrogram
Per the previous plot and the dendrogram, 3 clusters seems like a good choice:
data$groupID <- cutree(fit,k=3) # assign each point to one of our k=3 clusters
#rect.hclust(fit, k=3, border="red") # draw dendogram with red borders around the clusters
then finally plot the clusters
data$groupID <- as.factor(data$groupID) # set variable as factors
ggplot(data=data, aes(x=CV,y=ADS, color=groupID)) +
geom_point() +
xlab("Coefficient of Variation") +
ylab("Average Daily Sales") +
ggtitle("After clustering")
1/ What is the correct mean and median of the coefficient of variations of the sales in the SKU dataset?
mean(data$CV)
## [1] 0.396
median(data$CV)
## [1] 0.4
The mean is 0.396 and the median is 0.4.
2/ Do a hierarchical clustering on scaled data using an Euclidian distance and Ward.D clustering on the SKU dataset (DATA_2.01_SKU.csv). What are the resulting segments compared to what is shown in class if you decide to take only 2 clusters?
data$groupID <- cutree(fit,k=2)
# and plot:
data$groupID <- as.factor(data$groupID) # set variable as factors
ggplot(data=data, aes(x=CV,y=ADS, color=groupID)) +
geom_point() +
xlab("Coefficient of Variation") +
ylab("Average Daily Sales") +
ggtitle("After clustering")
With k=2 clusters, the segments "Crickets" and "Wild Bulls" are merged.
rm(list=ls(all=TRUE))
data <- read.csv(file = 'DATA_2.02_HR.csv')
str(data)
## 'data.frame': 2000 obs. of 6 variables:
## $ S : num 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
## $ LPE : num 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
## $ NP : int 2 5 7 5 2 2 6 5 5 2 ...
## $ ANH : int 157 262 272 223 159 153 247 259 224 142 ...
## $ TIC : int 3 6 4 5 3 3 4 5 5 3 ...
## $ Newborn: int 0 0 0 0 0 0 0 0 0 0 ...
summary(data)
## S LPE NP ANH
## Min. :0.09 Min. :0.450 Min. :2.000 Min. :126.0
## 1st Qu.:0.11 1st Qu.:0.520 1st Qu.:2.000 1st Qu.:146.0
## Median :0.41 Median :0.790 Median :4.000 Median :225.0
## Mean :0.44 Mean :0.721 Mean :3.877 Mean :207.9
## 3rd Qu.:0.73 3rd Qu.:0.900 3rd Qu.:6.000 3rd Qu.:262.0
## Max. :0.92 Max. :1.000 Max. :7.000 Max. :310.0
## TIC Newborn
## Min. :2.000 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:0.0000
## Median :4.000 Median :0.0000
## Mean :3.878 Mean :0.0525
## 3rd Qu.:5.000 3rd Qu.:0.0000
## Max. :6.000 Max. :1.0000
Normalizing the dataset:
testdata <- data # keeps our initial dataset safe
testdata <- scale(testdata) # normalizes the dataset (and transforms it to a matrix)
Then compute the distances of all the observations in our dataset:
d <- dist(testdata, method = "euclidean") # computes the distances of all the observations
And perform hiearchical clustering (Ward's method) and plot the dendogram:
fit <- hclust(d, method="ward.D")
plot(fit) # plot dendrogram
Per the dendrogram, 4 clusters seems like a good choice:
data$groups <- cutree(fit,k=4) # assign our points to our k=4 clusters
#rect.hclust(fit, k=4, border="red") # draw dendogram with red borders around the clusters
Then computes the average value of each variable for each cluster:
library(dplyr)
aggdata <- data %>% group_by(groups) %>% summarise_each(funs(mean))
aggdata
## Source: local data frame [4 x 7]
##
## groups S LPE NP ANH TIC Newborn
## (int) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 1 0.4082346 0.5106936 2.000000 143.5044 3.000000 0
## 2 2 0.7355272 0.8732588 4.476038 237.7732 4.811502 0
## 3 3 0.1028571 0.8715126 6.237395 276.7353 4.115546 0
## 4 4 0.4455238 0.7195238 3.780952 204.3524 3.866667 1
# or equivalently, using the aggregate function:
#aggdata <- aggregate(.~ groups, data=data, FUN=mean)
Computes the proportion of our data that is in each cluster:
data %>% group_by(groups) %>%
summarise( count = n()) %>%
mutate(freq = count / sum(count))
## Source: local data frame [4 x 3]
##
## groups count freq
## (int) (int) (dbl)
## 1 1 793 0.3965
## 2 2 626 0.3130
## 3 3 476 0.2380
## 4 4 105 0.0525
# or equivalently, using the aggregate function:
#proptemp <- aggregate(S ~ groups, data=data, FUN=length) # we create a variable called proptemp which computes the number of observations in each group (using the S variable, but you can take any.)
#aggdata$proportion <- (proptemp$S)/sum(proptemp$S) # proportion of observations in each group we compute the ratio between proptemp and the total number of observations
#aggdata <- aggdata[order(aggdata$proportion,decreasing=T),] # Let's order the groups from the larger to the smaller
As discussed in the videos, let's remove the Newborn variable, and try clustering again This variable is not really relevant and by being a dummy drives the clustering too much
testdata <- data[,1:5] # we create a new dataset, called "testdata", that includes the 5 first columns of our original dataset
We then rerun the code used above:
testdata <- scale(testdata) # normalizes our original variables
d <- dist(testdata, method = "euclidean") # computes distances between observations
fit <- hclust(d, method="ward.D") # Hierachical Clustering using Ward criterion
plot(fit)
Per the dendrogram, 4 clusters seems like a good choice:
data$groups <- cutree(fit,k=4) # assign our points to our k=4 clusters
#rect.hclust(fit, k=4, border="red") # draw dendogram with red borders around the clusters
Note that we re-use the original dataset "data" (where the variable Newborn is still present) and not "testdata" (where the variable Newborn has been removed) Hence we'll be able to produce summary statistics also for the Newborn variable regardless it wasn't included when doing the second version of the clustering
Then recomputes the average value of each variable for each cluster:
aggdata <- data %>% group_by(groups) %>% summarise_each(funs(mean))
aggdata
## Source: local data frame [4 x 7]
##
## groups S LPE NP ANH TIC Newborn
## (int) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 1 0.4083871 0.5105615 2.000000 143.5735 3.000000 0.05256870
## 2 2 0.8184082 0.9225306 4.604082 247.3918 5.206122 0.05102041
## 3 3 0.1033399 0.8718182 6.201581 276.3004 4.100791 0.04743083
## 4 4 0.5076647 0.7276048 4.107784 207.3473 3.706587 0.07185629
And recomputes the proportion of our data that is in each cluster:
data %>% group_by(groups) %>%
summarise( count = n()) %>%
mutate(freq = count / sum(count))
## Source: local data frame [4 x 3]
##
## groups count freq
## (int) (int) (dbl)
## 1 1 837 0.4185
## 2 2 490 0.2450
## 3 3 506 0.2530
## 4 4 167 0.0835
To export our results, we can run the following code
# write.csv(aggdata, "HR_example_Numerical_Output.csv", row.names=FALSE)
# This allows to import the data in Excel for instance where we can prepare it for the presentation, if needed
1/ Plot of the last project evaluation as a function of the number of projects done:
ggplot(data=data, aes(x=NP,y=LPE)) +
geom_point() +
xlab("Number of project") +
ylab("Last Project Evaluation")
2/ If you cluster the HR dataset on Satisfaction, Project Evaluation and Number of Projects Done and that you keep 2 segments using the same values for the other specifications (scaling, distance type and clustering algorithm), what's the resulting median Satisfaction per segment?
testdata <- data[,1:3] # keeping only Satisfaction, Project Evaluation and Number of Projects
testdata <- scale(testdata) # normalizes our original variables
d <- dist(testdata, method = "euclidean") # computes distances between observations
fit <- hclust(d, method="ward.D") # Hierachical Clustering using Ward criterion
plot(fit)
data$groups <- cutree(fit,k=2) # k=2 clusters
aggdata <- data %>% group_by(groups) %>% summarise_each(funs(median))
aggdata
## Source: local data frame [2 x 7]
##
## groups S LPE NP ANH TIC Newborn
## (int) (dbl) (dbl) (int) (int) (int) (int)
## 1 1 0.41 0.52 2 146 3 0
## 2 2 0.64 0.90 5 260 5 0
The median of Satisfation per segment is 0.41 for segment #1 and 0.64 for segment #2.
The objective here is to find segments of customers that use the services in a similar ways.
rm(list=ls(all=TRUE))
data <- read.csv(file = 'DATA_2.03_Telco.csv')
str(data)
## 'data.frame': 1000 obs. of 5 variables:
## $ Calls : num 1.12 1.08 3.54 1.09 1.04 5.59 1.19 3.07 5.34 2.43 ...
## $ Intern: num 0.19 0.22 0.26 0.21 0.24 0.88 0.19 0.34 0.99 0.09 ...
## $ Text : num 23.9 17.8 289.8 19.1 20.3 ...
## $ Data : num 0.18 0.23 1.99 0.21 0.2 2.02 0.24 2.06 0.97 0.46 ...
## $ Age : int 60 54 34 61 56 41 71 30 41 33 ...
summary(data)
## Calls Intern Text Data
## Min. :0.850 Min. :0.0900 Min. : 17.01 Min. :0.170
## 1st Qu.:1.490 1st Qu.:0.1100 1st Qu.: 20.77 1st Qu.:0.520
## Median :2.040 Median :0.2200 Median :190.53 Median :1.980
## Mean :2.630 Mean :0.4048 Mean :225.02 Mean :1.965
## 3rd Qu.:3.493 3rd Qu.:0.7900 3rd Qu.:449.44 3rd Qu.:2.902
## Max. :6.000 Max. :1.2000 Max. :598.94 Max. :4.790
## Age
## Min. :12.00
## 1st Qu.:24.00
## Median :35.00
## Mean :37.78
## 3rd Qu.:51.00
## Max. :72.00
Let's normalize our variables and compute the distance between variables:
testdata <- data # keeps our intial dataset safe
testdata <- scale(testdata) # normalizes all variables
d <- dist(testdata, method = "euclidean")
and perform hiearchical clustering (Ward's method) and plot the dendogram:
fit <- hclust(d, method="ward.D")
plot(fit)
Per the dendrogram, let's start with 8 clusters:
data$groups <- cutree(fit, k=8) # assign our points to our k=8 clusters
#rect.hclust(fit, k=8, border="red") # draw dendogram with red borders around the clusters
Then computes the average value of each variable for each cluster:
aggdata <- data %>% group_by(groups) %>% summarise_each(funs(mean))
aggdata
## Source: local data frame [8 x 6]
##
## groups Calls Intern Text Data Age
## (int) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 1 1.021134 0.2044845 20.87433 0.2088144 60.77835
## 2 2 3.090476 0.3082313 312.06755 2.0329932 30.74150
## 3 3 5.129716 1.0232955 21.32989 2.0599432 45.90909
## 4 4 3.540654 0.6533645 322.21607 2.1341121 40.08411
## 5 5 2.307582 0.1025275 20.57440 0.5201099 35.78022
## 6 6 1.550319 0.1013830 506.58840 3.0772340 14.31915
## 7 7 1.802025 0.1032278 513.06924 4.1231646 21.58861
## 8 8 1.679091 1.0509091 191.22182 1.4072727 32.84848
And computes the proportion of our data that is in each cluster:
data %>% group_by(groups) %>%
summarise( count = n()) %>%
mutate(freq = count / sum(count))
## Source: local data frame [8 x 3]
##
## groups count freq
## (int) (int) (dbl)
## 1 1 194 0.194
## 2 2 147 0.147
## 3 3 176 0.176
## 4 4 107 0.107
## 5 5 91 0.091
## 6 6 94 0.094
## 7 7 158 0.158
## 8 8 33 0.033
Let's try again with 5 segments
data$groups <- cutree(fit, k=5) # assign our points to our k=5 clusters
#rect.hclust(fit, k=5, border="red") # draw dendogram with red borders around the clusters
Then computes the average value of each variable for each cluster:
aggdata <- data %>% group_by(groups) %>% summarise_each(funs(mean))
aggdata
## Source: local data frame [5 x 6]
##
## groups Calls Intern Text Data Age
## (int) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 1 1.021134 0.2044845 20.87433 0.2088144 60.77835
## 2 2 3.096028 0.5222997 301.95599 1.9987456 34.46690
## 3 3 5.129716 1.0232955 21.32989 2.0599432 45.90909
## 4 4 2.307582 0.1025275 20.57440 0.5201099 35.78022
## 5 5 1.708135 0.1025397 510.65179 3.7330159 18.87698
And computes the proportion of our data that is in each cluster:
data %>% group_by(groups) %>%
summarise( count = n()) %>%
mutate(freq = count / sum(count))
## Source: local data frame [5 x 3]
##
## groups count freq
## (int) (int) (dbl)
## 1 1 194 0.194
## 2 2 287 0.287
## 3 3 176 0.176
## 4 4 91 0.091
## 5 5 252 0.252
To export our results, we can execute the following code
# write.csv(aggdata, file = "aggdataTelco5seg.csv", row.names=FALSE)
First, we need to reshape data from wide to long format:
library(tidyr)
aggdata.long <- aggdata %>% gather(key , value, -groups) # reshaping data from wide to long
aggdata.long
## Source: local data frame [25 x 3]
##
## groups key value
## (int) (chr) (dbl)
## 1 1 Calls 1.0211340
## 2 2 Calls 3.0960279
## 3 3 Calls 5.1297159
## 4 4 Calls 2.3075824
## 5 5 Calls 1.7081349
## 6 1 Intern 0.2044845
## 7 2 Intern 0.5222997
## 8 3 Intern 1.0232955
## 9 4 Intern 0.1025275
## 10 5 Intern 0.1025397
## .. ... ... ...
library(ggplot2)
ggplot(data=aggdata.long, aes(x=groups,y=value,fill=key)) +
geom_bar(stat="identity",position="dodge")
# Draw the radar chart with the function stars()
#palette(rainbow(12, s = 0.6, v = 0.75)) # Select the colors to use
#stars(aggdata[,2:(ncol(data))], len = 0.6, key.loc = c(11, 6),xlim=c(2,12),main = "Segments", draw.segments = TRUE,nrow = 2, cex = .75,labels=aggdata$groups)