-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnbc.jl
139 lines (127 loc) · 3.43 KB
/
nbc.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""
    nbc(X_train, y_train, X_test, n=5)

Run `nbc_cv` on the training data, then classify `X_test` with `nbc_predict`.

# Returns
A tuple `(predictions, accuracy)`: the output of `nbc_predict` followed by the
accuracy value reported by `nbc_cv`.
"""
function nbc(X_train, y_train, X_test, n=5)
    # Validation runs first so its console report precedes the prediction work.
    cv_accuracy = nbc_cv(X_train, y_train, n)
    predictions = nbc_predict(X_train, y_train, X_test)
    return predictions, cv_accuracy
end
"""
    nbc_cv(X_train::Matrix, y_train::Vector, n)

Validate the classifier through `cv_helper`, print the resulting accuracy between
two banner lines, and return that accuracy.

`n` is forwarded to `cv_helper` both as-is and as `1 / n`; presumably these are
the fold count and the hold-out fraction -- confirm against `cv_helper`.
"""
function nbc_cv(X_train::Matrix, y_train::Vector, n)
    println("...........................validating nbc.............................")
    outcome = cv_helper(X_train, y_train, nbc_train, nothing, [0], n, "nbc", 1 / n)
    # Only the second element of the helper's result is used here.
    _, accuracy = outcome
    println("nbc accuracy: ", accuracy)
    println(".......................................................................")
    return accuracy
end
"""
    nbc_train(X_train, y_train, X_test, args)

Four-argument adapter around `nbc_predict`, in the form passed to `cv_helper` by
`nbc_cv`. `args` is accepted and ignored.
"""
nbc_train(X_train, y_train, X_test, args) = nbc_predict(X_train, y_train, X_test)
"""
    nbc_predict(X_train, y_train, X_test)

Classify every row of `X_test` with a naive Bayes model fitted on
`X_train` / `y_train`.

For each test row and each candidate label the score is the label prior
(`compute_priors`) multiplied by one factor per feature column:

- columns flagged by `check_cat` use the per-label frequency table built by
  `cat_nbc`; a value that is absent from the table contributes no factor;
- all other columns use `compute_gaussian` with the per-label statistics built
  by `gau_nbc`.

# Returns
An untyped vector with the highest-scoring label for each row of `X_test`.
"""
function nbc_predict(X_train, y_train, X_test)
    labels = levels(y_train)
    priors = compute_priors(y_train)
    cat_map = cat_nbc(X_train, y_train)
    gau_map = gau_nbc(X_train, y_train)
    is_cat_list = check_cat(X_train)
    res = []
    for i in axes(X_test, 1)
        prob_per_label = []
        for l in labels
            score = priors[l]
            for j in axes(X_test, 2)
                value = X_test[i, j]
                if is_cat_list[j] == 1
                    likelihoods = cat_map[l][j].map
                    if haskey(likelihoods, value)
                        score *= likelihoods[value]
                    end
                else
                    # Use the statistics of the candidate label `l`. Indexing by
                    # `y_train[i]` would pick the training label at the test-row
                    # index, which is unrelated to the hypothesis being scored.
                    score *= compute_gaussian(value, gau_map[l], j)
                end
            end
            push!(prob_per_label, score)
        end
        push!(res, labels[argmax(prob_per_label)])
    end
    return res
end
"""
    compute_priors(y::Vector)

Return the per-label probability table for `y`, as produced by `dict_helper`
called with its default `regularize` setting.
"""
compute_priors(y::Vector) = dict_helper(y)
"""
    Cat_Info

Wrapper around a single dictionary stored in the field `map`.

In this file `cat_nbc` fills `map` with the output of `dict_helper` for one
feature column restricted to one label, and `nbc_predict` looks test values up
in it.
"""
struct Cat_Info
map::Dict
end
"""
    cat_nbc(x::Matrix, y::Vector)

Build the per-label, per-column frequency tables.

# Returns
A `Dict` keyed by each distinct label in `y`. Each value is a vector with one
`Cat_Info` per column of `x`, wrapping `dict_helper(column, true)` computed on
the rows whose label compares `==` to that key.
"""
function cat_nbc(x::Matrix, y::Vector)
    tables = Dict()
    # `unique` keeps first-occurrence order, so labels are inserted in the same
    # order in which they first appear in `y`.
    for label in unique(y)
        rows = y .== label
        tables[label] = Any[Cat_Info(dict_helper(x[rows, j], true)) for j in axes(x, 2)]
    end
    return tables
end
"""
    dict_helper(vec::Vector, regularize=false)

Estimate the probability of every distinct value in `vec`.

# Arguments
- `vec`: observations; `NaN` and `missing` entries are not counted as values,
  but they still contribute to `length(vec)` in the denominator.
- `regularize`: when `true`, apply add-one (Laplace) smoothing,
  `(count + 1) / (length(vec) + number_of_distinct_values)`; when `false`,
  return the plain relative frequency `count / length(vec)`.

# Returns
An untyped `Dict` mapping each distinct value to its estimated probability.
Empty when `vec` holds no countable value.
"""
function dict_helper(vec::Vector, regularize=false)
    counts = Dict()
    for e in vec
        # `isnan` is only defined for numbers; guard it so that string or symbol
        # categories can be counted instead of raising a MethodError.
        (ismissing(e) || (e isa Number && isnan(e))) && continue
        counts[e] = get(counts, e, 0) + 1
    end
    total = length(vec)
    n_distinct = length(counts)
    res = Dict()
    for (k, v) in counts
        # Smoothing is applied when `regularize` is set, not when it is unset.
        res[k] = regularize ? (v + 1) / (total + n_distinct) : v / total
    end
    return res
end
"""
    Gaussian_Info

Two `Float64` vectors, `mean` and `std`.

In this file `gau_nbc` builds one instance per label with one entry per feature
column, and `compute_gaussian` reads both vectors at the same position.
"""
struct Gaussian_Info
mean::Vector{Float64}
std::Vector{Float64}
end
"""
    gau_nbc(x::Matrix, y::Vector)

Compute per-label column statistics for the Gaussian features.

Rows are grouped by their label in `y` (rows whose label is `NaN` are left out).
For every group and every column of `x`, the `mean` and `std` of the non-`NaN`
entries are collected.

# Returns
A `Dict` mapping each label to a `Gaussian_Info` holding one mean and one
standard deviation per column of `x`.
"""
function gau_nbc(x::Matrix, y::Vector)
    # Row indices that belong to each label.
    rows_by_label = Dict()
    for (i, label) in pairs(y)
        isnan(label) && continue
        push!(get!(Vector{Int}, rows_by_label, label), i)
    end

    stats = Dict()
    for (label, rows) in rows_by_label
        means = []
        stds = []
        for j in axes(x, 2)
            # Non-NaN observations of column `j` within this label's rows.
            observed = []
            for r in rows
                value = x[r, j]
                isnan(value) || push!(observed, value)
            end
            push!(means, mean(observed))
            push!(stds, std(observed))
        end
        stats[label] = Gaussian_Info(means, stds)
    end
    return stats
end
"""
    compute_gaussian(x, info::Gaussian_Info, label)

Evaluate the normal probability density at `x` for one feature.

# Arguments
- `x`: observed feature value.
- `info`: statistics to read the mean and standard deviation from.
- `label`: position in `info.mean` / `info.std` to use. Despite its name,
  `nbc_predict` passes the feature column index here.

# Returns
`exp(-z^2 / 2) / (sigma * sqrt(2pi))` with `z = (x - mu) / sigma`.

NOTE(review): a standard deviation of zero or `NaN` (for example a class with a
single sample) is not special-cased and yields a non-finite result.
"""
function compute_gaussian(x, info::Gaussian_Info, label)
    mu = info.mean[label]
    sigma = info.std[label]
    z = (x - mu) / sigma
    # Density rather than `erfc(z)`: the complementary error function decreases
    # monotonically in `x`, so it would reward values far below the mean.
    return exp(-z^2 / 2) / (sigma * sqrt(2 * pi))
end
"""
    check_cat(X_train)

Flag each column of `X_train` by its number of distinct values.

# Returns
A `Dict` mapping every column index to `1` when the column holds fewer than 10
distinct values and to `0` otherwise. `nbc_predict` treats columns flagged `1`
as categorical.
"""
function check_cat(X_train)
    flags = Dict()
    for j in axes(X_train, 2)
        # A Set{Any} deduplicates with the same isequal/hash rules as Dict keys.
        distinct = Set{Any}(X_train[:, j])
        flags[j] = length(distinct) < 10 ? 1 : 0
    end
    return flags
end