-
Notifications
You must be signed in to change notification settings - Fork 0
/
mits.go
128 lines (117 loc) · 3.38 KB
/
mits.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
/*
Package mits implements simple text segmentation by calculating the mutual information value of adjacent words.
*/
package mits
import (
"math"
)
// Terms holds the result of MITS: a map from each term found in a string to
// the probability of that term.
type Terms map[string]float64
// Mits is intended to read sentences and break them into single-character and
// double-character pieces.
//
// NOTE(review): the body is empty, so calling Mits currently does nothing.
func Mits() {
}
// prob is intended to return the probability of a single word across all
// documents/sentences.
//
// NOTE(review): this is an empty placeholder; it takes no arguments and
// returns nothing yet.
func prob() {
}
// CountSingleWord tallies how many times each rune occurs in sentence.
//
// It returns the per-rune counts, keyed by the rune converted to a string,
// together with the total number of runes in sentence.
func CountSingleWord(sentence []rune) (single map[string]int, count int) {
	freq := map[string]int{}
	for idx := 0; idx < len(sentence); idx++ {
		freq[string(sentence[idx])]++
	}
	return freq, len(sentence)
}
// CountTwinWord counts how often each pair of adjacent runes occurs in
// sentence.
//
// It returns the per-pair counts, keyed by the two-rune string, and the total
// number of pairs. A sentence of n >= 2 runes has n-1 pairs; a sentence with
// fewer than two runes has none, so count is 0 and never negative.
func CountTwinWord(sentence []rune) (twin map[string]int, count int) {
	twin = make(map[string]int)
	// i+1 < len(sentence) stops before the last rune, which has no successor.
	for i := 0; i+1 < len(sentence); i++ {
		twin[string(sentence[i:i+2])]++
		count++
	}
	return twin, count
}
// CountTermFreq gathers both single-rune and adjacent-pair frequencies for
// sentence.
//
// Returns:
//   - single: occurrences of each rune, keyed by the rune as a string
//   - twin:   occurrences of each pair of adjacent runes, keyed by the two-rune string
//   - s_len:  number of runes in sentence
//   - t_len:  number of adjacent pairs counted
func CountTermFreq(sentence []rune) (single, twin map[string]int, s_len, t_len int) {
	unigrams := make(map[string]int)
	for _, r := range sentence {
		unigrams[string(r)]++
	}

	bigrams := make(map[string]int)
	pairs := 0
	// end is the exclusive upper bound of each two-rune window.
	for end := 2; end <= len(sentence); end++ {
		bigrams[string(sentence[end-2:end])]++
		pairs++
	}

	return unigrams, bigrams, len(sentence), pairs
}
// CalcMI is meant to combine single-word and twin-word frequencies into the
// mutual information value of a pair of consecutive words, using:
//
//	MI(x[i], x[i+1]) = p(x[i], x[i+1]) * log(p(x[i], x[i+1]) / (p(x[i]) * p(x[i+1])))
//
// The calculation is not implemented: single, twin and word are ignored and
// the result is always 0.
func CalcMI(single, twin map[string]int, word string) (mi float64) {
	return 0
}
// Segment splits sentence into terms using the mutual information (MI) of
// each pair of adjacent runes.
//
// Frequencies are counted over sentence itself, and for every adjacent pair
// (a, b) the value
//
//	MI(a, b) = p(ab) * log(p(ab) / (p(a) * p(b)))
//
// is compared with theta. A pair whose MI is below theta marks a boundary
// between two terms; otherwise b is joined onto the term currently being
// built.
//
// The returned terms cover the whole sentence in order, including the final
// term. An empty sentence yields an empty, non-nil slice.
func Segment(sentence []rune, theta float64) (terms []string) {
	terms = make([]string, 0)
	if len(sentence) == 0 {
		// No runes means no terms.
		return terms
	}

	single, twin, singleTotal, twinTotal := CountTermFreq(sentence)

	start := 0 // index of the first rune of the term being built
	for i := 1; i < len(sentence); i++ {
		pPair := float64(twin[string(sentence[i-1:i+1])]) / float64(twinTotal)
		pPrev := float64(single[string(sentence[i-1])]) / float64(singleTotal)
		pCur := float64(single[string(sentence[i])]) / float64(singleTotal)
		mi := pPair * math.Log(pPair/(pPrev*pCur))
		if mi < theta {
			// Weak association: close the current term just before rune i.
			terms = append(terms, string(sentence[start:i]))
			start = i
		}
	}

	// The loop only emits a term when it finds a boundary, so the term still
	// being built when the sentence ends must be emitted here.
	terms = append(terms, string(sentence[start:]))
	return terms
}
// SegmentSens segments several sentences using frequencies pooled across all
// of them.
//
// A first pass sums the single-rune and adjacent-pair counts of every
// sentence. A second pass walks each sentence and, for every adjacent pair
// (a, b), compares
//
//	MI(a, b) = p(ab) * log(p(ab) / (p(a) * p(b)))
//
// with theta: a pair whose MI is below theta marks a boundary between two
// terms, otherwise b is joined onto the term currently being built.
//
// The terms of all sentences are returned in one slice, in input order, and
// each sentence contributes its final term as well. Empty sentences
// contribute nothing. With no input the result is an empty, non-nil slice.
func SegmentSens(sentences [][]rune, theta float64) (terms []string) {
	terms = make([]string, 0)

	// Pass 1: pool frequencies over the whole input.
	single := make(map[string]int)
	twin := make(map[string]int)
	singleTotal, twinTotal := 0, 0
	for _, sentence := range sentences {
		s, tw, sn, tn := CountTermFreq(sentence)
		singleTotal += sn
		twinTotal += tn
		for k, c := range s {
			single[k] += c
		}
		for k, c := range tw {
			twin[k] += c
		}
	}

	// Pass 2: cut each sentence at weakly associated pairs.
	for _, sentence := range sentences {
		if len(sentence) == 0 {
			// Nothing to emit for an empty sentence.
			continue
		}
		start := 0 // index of the first rune of the term being built
		for i := 1; i < len(sentence); i++ {
			pPair := float64(twin[string(sentence[i-1:i+1])]) / float64(twinTotal)
			pPrev := float64(single[string(sentence[i-1])]) / float64(singleTotal)
			pCur := float64(single[string(sentence[i])]) / float64(singleTotal)
			mi := pPair * math.Log(pPair/(pPrev*pCur))
			if mi < theta {
				terms = append(terms, string(sentence[start:i]))
				start = i
			}
		}
		// Emit the term still being built when the sentence ends; terms never
		// span two sentences.
		terms = append(terms, string(sentence[start:]))
	}
	return terms
}