-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrf.jl
135 lines (126 loc) · 4.54 KB
/
rf.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
function rf(X_train::Matrix, y_train::Vector, X_test::Matrix, n=5)
opt_depth, opt_reg, accuracy = rf_cv(X_train, y_train, n)
rf_tree = rf_helper(X_train, y_train, 0, opt_depth, opt_reg)
res = []
for i in axes(X_test)[1]
tmp = predict(rf_tree, X_test[i, :])
push!(res, tmp)
end
res, accuracy
end
function rf_cv(X_train::Matrix, y_train::Vector, n)
depths = trunc(Int, length(X_train[1, :])/2):trunc(Int, length(X_train[1, :])*3/20):length(X_train[1, :])*2
regularizations = [0.01 0.03 0.05 0.08 0.1 0.3 0.5 0.7 0.9 1.0 1.2 1.4 1.8]
println("...........................training rf depth...........................")
depth, _ = cv_helper(X_train, y_train, rf_train, nothing, depths, n, "depth", 1 / n)
println("opt_depth: ", depth)
println(".......................................................................")
println(".....................training rf regularization........................")
reg, accuracy = cv_helper(X_train, y_train, rf_train, depth, regularizations, n, "reg")
println("opt_regularization: ", reg, " with accuracy: ", accuracy)
println(".......................................................................")
depth, reg, accuracy
end
function rf_train(X_train::Matrix, y_train::Vector, X_test::Matrix, arg)
max_depth = arg[1]
regularization = 0.0
if isnothing(max_depth)
max_depth = arg[2]
else
regularization = arg[2]
end
rf_tree = rf_helper(X_train, y_train, 0, trunc(Int, max_depth), regularization)
res = []
for i in axes(X_test)[1]
tmp = predict(rf_tree, X_test[i, :])
push!(res, tmp)
end
res
end
function rf_helper(X::Matrix, y::Vector, depth::Int64, max_depth::Int64, regularization::Float64)
if length(y) == 0
return nothing
end
if depth > max_depth || length(y) <= 10
dict = Dict()
pred = nothing
tmp = 0
for e in y
dict[e] = haskey(dict, e) ? dict[e] + 1 : 1
if dict[e] > tmp
tmp = dict[e]
pred = e
end
end
return Tree(nothing, nothing, pred, nothing, nothing, nothing)
end
num_features = length(X[1, :])
min_entropy = typemax(Int)
feature = 1
value = 0
is_cat = 0
for f in shuffle(1:num_features)
tmp_value, is_cat, tmp_entropy = rf_step(X[:, f], y)
if tmp_entropy + regularization < min_entropy
min_entropy = tmp_entropy
value = tmp_value
feature = f
end
end
indices = is_cat == 1 ? X[:, feature] .== value : X[:, feature] .>= value
Tree(value, feature, nothing, is_cat, rf_helper(X[indices, :], y[indices], depth + 1, max_depth, regularization), rf_helper(X[(!).(indices), :], y[(!).(indices)], depth + 1, max_depth, regularization))
end
function rf_step(x::Vector, y::Vector)
return length(levels(x)) > 10 ? rf_step_helper(sort(x), x, y, 0) : rf_step_helper(levels(x), x, y, 1)
end
function rf_step_helper(vec::Vector, x::Vector, y::Vector, is_cat::Int64)
min_entropy = typemax(Int)
value = vec[1]
for e in vec
func(x) = isnan(e) ? e.===x : (is_cat == 1 ? e.==x : e.>=x);
tmp = entropy(y[func(x)]) + entropy(y[(!).(func(x))])
if tmp < min_entropy
min_entropy = tmp
value = e
end
end
value, is_cat, min_entropy
end
struct Tree
value::Union{Float64, Nothing}
feature::Union{Int64, Nothing}
pred::Union{Int64, Nothing}
is_cat::Union{Int64, Nothing}
left::Union{Tree, Nothing}
right::Union{Tree, Nothing}
end
function predict(t::Tree, x::Vector)
if !isnothing(t.pred)
return t.pred
end
if isnan(t.value)
return x[t.feature] === t.value ? (isnothing(t.left) ? predict(t.right, x) : predict(t.left, x)) : (isnothing(t.right) ? predict(t.left, x) : predict(t.right, x))
end
if t.is_cat == 1
return x[t.feature] == t.value ? (isnothing(t.left) ? predict(t.right, x) : predict(t.left, x)) : (isnothing(t.right) ? predict(t.left, x) : predict(t.right, x))
end
x[t.feature] >= t.value ? (isnothing(t.left) ? predict(t.right, x) : predict(t.left, x)) : (isnothing(t.right) ? predict(t.left, x) : predict(t.right, x))
end
function entropy(pred_y::Vector)
if length(pred_y) == 0
return 1
end
dict = Dict()
for e in pred_y
if haskey(dict, e)
dict[e] += 1
else
dict[e] = 1
end
end
res = 0
for (_, v) in dict
res -= v / length(pred_y) * log(v / length(pred_y))
end
res
end