-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGapStatistic.R
executable file
·152 lines (141 loc) · 4.08 KB
/
GapStatistic.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
##########################################################
# generates uniformly distributed sample data
# data = actual data (used to match length and dimensions)
##########################################################
getSampleData <- function(data){
# number of rows in actual dataset
n <- nrow(data)
attrs <- list()
# length(data) is number of attributes (dimensions)
for(i in 1:length(data)) {
# minimum value at current attribute
minval <- min(data[,i])
# maximum value at current attribute
maxval <- max(data[,i])
# save random values from min to max
attrs[[i]] <- runif(n, minval, maxval)
}
# combine all attributes
sampledata <- data.frame(attrs)
return(sampledata)
}
##################################################################
# get standard deviation within a cluster
# standard deviation calculation is only used for sample datasets
# to calculate SD:
# subtract mean from log(Wk) and square value for each dataset
# sum these values
# multiply by 1 / B (number of datasets)
# take the square root
# log_Wk_list = list of log(Wk) values for data
# mean = mean of log(Wk) values
# k = current k value
##################################################################
getSD <- function(log_Wk_list,mean,k){
sd_sum <- 0
# number of datasets
B <- length(log_Wk_list)
for(i in 1:B){
sd_sum <- sd_sum + (log_Wk_list[[B]]-mean)^2
}
return( ((1/B)*sd_sum)^.5 )
}
############################################
# returns Wk for a given number of clusters
# Wk is a variance quantity
# Wk = sum(D_k / 2n_k)
# data = data (actual or sample)
# k = number of clusters
############################################
getWk <- function(data,k){
# get kmeans data
clustering <- k_means(data, k)
# number of rows in data
n <- nrow(data)
Wk <- 0
# for each cluster
for(i in 1:k) {
# indeces of objects in cluster k
l <- which (clustering$clusterID %in% i)
# get centroid
centroid <- clustering$finalCent[i,]
# distance sum
dk <- 0
# for each object in k
for(j in l){
point <- data[j,]
dk <- dk + dist(rbind(point,centroid),method="euclidean")
}
Wk <- Wk + dk
}
return(Wk)
}
#############################################
# Gap Statistic method for finding best k
# in k-means.
# dat - data to analyze
# max_k - maximum number of clusters possible
# B - number of sample sets to average
#############################################
gapStatistic <- function(dat, max_k=10, B=10) {
# normalizes data
dat <- normalize(dat)
# Create sample sets
sampleList <- list()
for(i in 1:B){
sampleList[[i]] <- getSampleData(dat)
}
# Gap Statistic algorithm
Wk_list <- c()
# start with k = 1, test to k = max_k
k <- 1
repeat {
# Wk for sample data
log_Wk_sample_list <- c()
# sum of log(Wk) values
sum <- 0
# for each sample dataset
for(i in 1:B){
# get Wk for sample dataset
Wk <- getWk(sampleList[[i]],k)
# take log of Wk
log_Wk <- log2(Wk)
# add log_Wk to list
log_Wk_sample_list[[i]]<-log_Wk
# add log_Wk to sum
sum <- sum + log_Wk
}
# mean of log(Wk) values
log_Wk_sample <- sum / B
# Wk for actual data
log_Wk_data <- log2(getWk(dat,k))
# used for plotting
Wk_list<-rbind(Wk_list,c(k,log_Wk_data,log_Wk_sample))
# find gap value
gap <- log_Wk_sample - log_Wk_data
# find standard deviation
sd <- getSD(log_Wk_sample_list, log_Wk_sample, k)
# find sk value
sk <- ((1 + (1/B))^0.5) * sd
# check if k is too large
if(k > max_k) {
print("Sorry, maxed out on k.")
break
}
# k is optimal when Gap(k) >= Gap(k + 1) - sk
if (k != 1 && prev_gap >= (gap - sk)) {
# optimal k is at k - 1
k <- k - 1
# plot Wk data
df <- data.frame("x"=Wk_list[,1],"Wk"=Wk_list[,2],"Wk_sample"=Wk_list[,3])
plot(df[1:2], col = "blue", pch = 20, type="o") #data
points(df[3], col = "red", pch = 20, type="o") #sample
break
}
prev_gap <- gap
# increment k
k <- k + 1
}
# return the optimal k value
return(k)
}