################################
### Exercise 1 EM clustering ###
################################
library(EMCluster)
head(USArrests)
par(mfrow=c(1,2))
# Run a K-means clustering with 4 clusters and plot the clusters on the first 2 principal components
km.out <- kmeans(USArrests, 4)
pr.out <- prcomp(USArrests,scale=TRUE)
plot(pr.out$x[,1:2], type="n", asp=1)
text(pr.out$x[,1:2], labels=state.abb, col=km.out$cluster)
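# Side note (a sketch, not part of the exercise): the K-means above is run on the raw data,
# while the PCA uses scale=TRUE. Assault has a much larger variance than the other variables,
# so it tends to dominate the raw-data distances; clustering the scaled data may group states
# differently. A quick hedged check:
km.sc <- kmeans(scale(USArrests), centers=4, nstart=20)
table(raw = km.out$cluster, scaled = km.sc$cluster) # labels are arbitrary; only the grouping matters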
# The approach in the EMCluster package is to set up an initial solution using init.EM and then run the
# EM algorithm to get the optimal solution using the emcluster function. In the third step we obtain the
# probabilities of the optimal solution by running one more e.step.
emobj <- init.EM(USArrests, nclass = 4)
emclobj <- emcluster(USArrests, emobj, assign.class = TRUE)
emprobs <- round(e.step(USArrests, emobj = emclobj)$Gamma,3)
# When the data have more than two variables, the default plot for an EM object (of class emret)
# is a parallel coordinates plot, which is rarely helpful.
plotem(emobj,USArrests,lwd=2)
# Instead, plot the first 2 principal components again and compare with the K-means plot.
plot(pr.out$x[,1:2],type="n", asp=1)
text(pr.out$x[,1:2],labels=state.abb,col=emclobj$class)
par(mfrow=c(1,1))
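# A quick, hedged way to compare the two solutions numerically is a cross-tabulation of the
# assignments; cluster labels are arbitrary, so look for rows and columns that line up rather
# than matching label numbers.
table(kmeans = km.out$cluster, EM = emclobj$class)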
# We can look at each K-means cluster in more detail by inspecting the EM membership
# probabilities of its members (emprobs was already rounded to 3 decimal places above).
emprobs[km.out$cluster==1,]
emprobs[km.out$cluster==2,]
emprobs[km.out$cluster==3,]
emprobs[km.out$cluster==4,]
df <- data.frame(emprobs)
rowSums(df) # each row of probabilities sums to 1 (up to rounding)
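# Sketch: the maximum membership probability per state shows how confidently the EM model
# assigns it; values well below 1 flag borderline states. The 0.9 cutoff is purely illustrative.
maxprob <- apply(emprobs, 1, max)
state.abb[maxprob < 0.9]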
##############################
### Exercise 2 PIOMAS data ###
##############################
setwd("~/git-reps/machine-learning/")
##data
PIOMAS<-read.table("data/PIOMAS.txt",header=FALSE,row.names=1)
names(PIOMAS)<-month.abb
dim(PIOMAS)
PIOMAS<-PIOMAS[-40,] # removes 2018
year<-as.numeric(row.names(PIOMAS))
matplot(t(PIOMAS),type="l",xlab="Month",ylab="Sea Ice Volume", main="PIOMAS measures for each year") # One line for each year
# plot the mean values for each year
yave<-apply(PIOMAS, 1, mean)
plot(year, yave, ylab="Sea Ice Volume")
# K-means clustering
km.out <- kmeans(PIOMAS, centers=5)
plot(year, yave, ylab="Sea Ice Volume", col=km.out$cluster)
matplot(t(PIOMAS), type="l", xlab="Month", ylab="Sea Ice Volume", col=km.out$cluster)
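# The choice of 5 clusters above is ad hoc. As a hedged sketch, an elbow plot of the total
# within-cluster sum of squares over a range of k can suggest a reasonable number of clusters.
wss <- sapply(1:8, function(k) kmeans(PIOMAS, centers=k, nstart=20)$tot.withinss)
plot(1:8, wss, type="b", xlab="Number of clusters k", ylab="Total within-cluster SS")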
# Scatter Plot & Correlation
library(psych)
pairs.panels(PIOMAS[,1:4] # just Jan-Apr
, gap=0
, pch=21)
# PCA
pc <- prcomp(PIOMAS, center=TRUE) # center=TRUE is the default: each month is mean-centred before the components are computed
summary(pc)
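# Sketch: the proportion of variance per component can also be computed directly from the
# standard deviations returned by prcomp (or viewed with screeplot(pc)).
prop.var <- pc$sdev^2 / sum(pc$sdev^2)
round(cumsum(prop.var), 3)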
# Bi-Plot
library(ggbiplot)
g <- ggbiplot(pc,
obs.scale = 1,
var.scale = 1,
groups = factor(km.out$cluster),  # cluster labels are categories, not numbers
ellipse = TRUE,
circle = TRUE,
ellipse.prob = 0.68) # draw 68% normal-probability ellipses around each cluster
g <- g + scale_color_discrete(name = "Cluster")
g <- g + theme(legend.direction = "horizontal",
legend.position = "top")
print(g)
# What do you notice about the variance in the first few PCs?
# PC1 and PC2 capture > 99% of the variance. PC1 alone captures 98%.
# There is a very high correlation between all the variables.
# IMPORTANT:
# All arrows in the bi-plot point in roughly the same direction and have roughly the same length
# => the yearly mean can be used as (more or less) equivalent to the twelve monthly measures for each year
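# Two hedged checks of the statements above: the correlation matrix of the monthly measures,
# and how closely the first PC score tracks the yearly mean (the sign of a PC is arbitrary,
# so look at the magnitude of the correlation).
round(cor(PIOMAS), 2)
cor(yave, pc$x[,1])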
# These data form a time series and are very highly autocorrelated: each monthly value is
# closely related to the previous one. Most of this autocorrelation can be removed
# from the data by considering the month-on-month differences in the sea ice volume. Notice that
# the first variable in each row is the difference February minus January, hence the odd variable names.
#read in the original data and convert to a vector
tt<-c(t(read.table("data/PIOMAS.txt",header=FALSE,row.names=1)))
tt[1:24]
#calculate the differences for the first 39 years of data
tt<-diff(tt[1:(39*12+1)]) #lagged diffs (compared to previous value)
tt[1:24]
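# Hedged check of the autocorrelation claim: compare the ACF of the raw monthly series with
# the ACF of the differenced series (tt now holds the differences).
par(mfrow=c(1,2))
acf(c(t(PIOMAS)), main="Raw monthly volumes")
acf(tt, main="Month-on-month differences")
par(mfrow=c(1,1))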
#put back into data frame format
PIOdiff<-as.data.frame(matrix(tt,byrow=T,ncol=12))
#variable names
names(PIOdiff)<-month.abb[c(2:12,1)]
#row names
row.names(PIOdiff)<-row.names(PIOMAS)
matplot(t(PIOdiff), type="l", xlab="Month (differences, Feb-Jan ordering)", ylab="Change in Sea Ice Volume")
# Large negative values imply a large amount of sea ice melting in that month.
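# Possible follow-up (a sketch, not part of the original exercise): repeating the PCA on the
# differenced data shows how the variance spreads across components once the strong common
# trend has been removed.
pc.diff <- prcomp(PIOdiff, center=TRUE)
summary(pc.diff)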