forked from ryanbressler/CloudForest
-
Notifications
You must be signed in to change notification settings - Fork 0
/
importance_test.go
111 lines (91 loc) · 2.67 KB
/
importance_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
package CloudForest
import (
"strings"
"testing"
)
// TestImportance grows random forests on the boston housing data set (with
// artificial contrast features added) and checks that the RM and LSTAT
// features come out on top of the importance ranking. The check is repeated
// for three configurations: the standard settings, the "vetting" flag enabled
// and the "eval oob" flag enabled. The test is skipped in -short mode.
func TestImportance(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping importance test on boston data set.")
	}

	boston := strings.NewReader(boston_housing)
	fm := ParseARFF(boston)

	if len(fm.Data) != 14 {
		t.Errorf("Boston feature matrix has %v features not 14", len(fm.Data))
	}

	// Add artificial contrasts.
	fm.ContrastAll()

	// Every feature except the target is a split candidate.
	targeti := fm.Map["class"]
	candidates := make([]int, 0, len(fm.Data))
	for i := 0; i < len(fm.Data); i++ {
		if i != targeti {
			candidates = append(candidates, i)
		}
	}

	numtarget := fm.Data[targeti]
	nTrees := 20

	// Breiman's importance definition.
	imp := func(mean float64, count float64) float64 {
		return mean * count / float64(nTrees)
	}

	roomi := fm.Map["RM"]
	lstati := fm.Map["LSTAT"]

	// checkTopFeatures grows one forest with the given flags, collecting
	// importance scores, and reports msg as a test error when more than one
	// feature scores strictly higher than RM or than LSTAT (i.e. when they are
	// not the top two). vet and evaloob are passed as the 11th and 12th
	// arguments of GrowRandomForest; the names follow the original section
	// labels of this test.
	checkTopFeatures := func(vet bool, evaloob bool, msg string) {
		imppnt := NewRunningMeans(len(fm.Data))
		_ = GrowRandomForest(fm, numtarget.(Feature), candidates, fm.Data[0].Length(), 6, nTrees, 1, 0, false, false, vet, evaloob, imppnt)

		roomimp := imp((*imppnt)[roomi].Read())
		lstatimp := imp((*imppnt)[lstati].Read())

		beatroom := 0
		beatlstat := 0
		for _, rm := range *imppnt {
			fimp := imp(rm.Read())
			if fimp > roomimp {
				beatroom++
			}
			if fimp > lstatimp {
				beatlstat++
			}
		}

		if beatroom > 1 || beatlstat > 1 {
			t.Error(msg)
		}
	}

	// standard
	checkTopFeatures(false, false, "RM and LSTAT features not most important in boston data set regression.")

	// vetting
	checkTopFeatures(true, false, "RM and LSTAT features not most important in vetted boston data set regression.")

	// evaloob
	checkTopFeatures(false, true, "RM and LSTAT features not most important in boston data set regression with eval oob.")
}