-
Notifications
You must be signed in to change notification settings - Fork 5
/
simhasher.go
146 lines (130 loc) · 3.7 KB
/
simhasher.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
package gosimhash
import (
	"fmt"
	"math/bits"
	"strconv"

	"github.com/HaoyuHu/gosimhash/utils"
	jieba "github.com/yanyiwu/gojieba"
)
// Constants shared by the simhash construction, formatting and comparison
// helpers in this package.
const (
	// BitsLength is the width, in bits, of a simhash fingerprint. It sizes the
	// per-bit weight vector and is the bit size handed to strconv.ParseUint.
	BitsLength = 64

	// Binary is the numeric base used when a fingerprint is formatted as, or
	// parsed from, a string.
	Binary = 2

	// DefaultHashKey is not referenced in this file; presumably the default
	// key for a keyed hasher defined elsewhere in the project (TODO confirm).
	DefaultHashKey = "8b6555d0c9cff7a9"

	// DefaultThresholdDist is not referenced in this file; presumably the
	// suggested maximum bit distance at which two fingerprints are still
	// considered duplicates (TODO confirm).
	DefaultThresholdDist = 3
)
// Simhasher computes simhash fingerprints for documents. It pairs a gojieba
// instance, which extracts weighted words from a document, with a Hasher that
// maps each extracted word to a 64-bit value.
type Simhasher struct {
	extractor *jieba.Jieba // weighted-word extractor; released by Free
	hasher    utils.Hasher // supplies Hash64 for each extracted word
}
// HashWeight couples the 64-bit hash of a word with that word's weight.
type HashWeight struct {
	hash   uint64  // 64-bit hash of the word
	weight float64 // weight added to or subtracted from each bit's tally
}
// NewSimpleSimhasher returns a Simhasher that hashes words with the Jenkins
// hasher from the utils package. Every dictionary path is passed as the empty
// string, so NewSimhasher substitutes the gojieba default paths.
func NewSimpleSimhasher() *Simhasher {
	return NewSimhasher(utils.NewJenkinsHasher(), "", "", "", "", "")
}
// NewSimhasher builds a Simhasher from the given word hasher and gojieba
// resource paths.
//
// dict, hmm, userDict, idf and stopWords are file paths handed to
// jieba.NewJieba; an empty string for any of them is replaced by the
// corresponding gojieba default path before the extractor is created.
//
// The returned Simhasher owns a gojieba instance; call Free when done with it.
func NewSimhasher(hasher utils.Hasher, dict string, hmm string, userDict string, idf string, stopWords string) *Simhasher {
	// Fill in defaults in place for every path the caller left empty.
	getDictPath(&dict, &hmm, &userDict, &idf, &stopWords)

	extractor := jieba.NewJieba(dict, hmm, userDict, idf, stopWords)
	return &Simhasher{extractor: extractor, hasher: hasher}
}
// MakeSimhash computes the 64-bit simhash fingerprint of *doc.
//
// The extractor's ExtractWithWeight is asked for weighted words from the
// document (topN is forwarded to it unchanged) and each word is hashed to 64
// bits. For every bit position a running tally gains the word's weight when
// the hash has that bit set and loses it otherwise. The result has a 1 at each
// position whose tally ends strictly positive, so a document that yields no
// words produces 0.
//
// doc must be non-nil: it is dereferenced unconditionally.
func (s *Simhasher) MakeSimhash(doc *string, topN int) uint64 {
	words := s.extractor.ExtractWithWeight(*doc, topN)
	hashed := make([]HashWeight, len(words))
	s.convertWordWeights2HashWeights(words, hashed)

	// One signed tally per fingerprint bit.
	var tally [BitsLength]float64
	for _, hw := range hashed {
		for bit := range tally {
			if hw.hash&(uint64(1)<<uint(bit)) != 0 {
				tally[bit] += hw.weight
			} else {
				tally[bit] -= hw.weight
			}
		}
	}

	// Collapse the tallies into bits: positive means 1, anything else 0.
	var fingerprint uint64
	for bit, sum := range tally {
		if sum > 0 {
			fingerprint |= uint64(1) << uint(bit)
		}
	}
	return fingerprint
}
// MakeSimhashBinString computes the simhash of *doc via MakeSimhash and
// returns it as a base-2 string. strconv.FormatUint emits no leading zeros,
// so the string may be shorter than 64 characters.
//
// doc must be non-nil; topN is forwarded to MakeSimhash.
func (s *Simhasher) MakeSimhashBinString(doc *string, topN int) string {
	return strconv.FormatUint(s.MakeSimhash(doc, topN), Binary)
}
// Free releases the underlying gojieba extractor by calling its Free method
// and drops the reference to the hasher. The hasher is nil afterwards, so the
// Simhasher must not be used to compute further fingerprints.
func (s *Simhasher) Free() {
	s.extractor.Free()
	s.hasher = nil
}
// CalculateDistanceBySimhash returns the Hamming distance between two simhash
// fingerprints: the number of bit positions, from 0 to 64, at which simhash
// and another differ.
func CalculateDistanceBySimhash(simhash uint64, another uint64) int {
	// The differing positions are exactly the set bits of the XOR.
	return bits.OnesCount64(simhash ^ another)
}
// IsSimhashDuplicated reports whether two simhash fingerprints differ in at
// most limit bit positions. A negative limit can never be satisfied, so the
// result is then always false.
func IsSimhashDuplicated(simhash uint64, another uint64, limit int) bool {
	return bits.OnesCount64(simhash^another) <= limit
}
// CalculateDistanceBySimhashBinString parses two base-2 fingerprint strings
// (as produced by MakeSimhashBinString) and returns the Hamming distance
// between them.
//
// If either string is not a valid binary number that fits in 64 bits, the
// distance is 0 and the error wraps the underlying strconv error, naming the
// offending argument. Nothing is printed: reporting is left to the caller.
func CalculateDistanceBySimhashBinString(simhashStr string, anotherStr string) (int, error) {
	simhash, err := strconv.ParseUint(simhashStr, Binary, BitsLength)
	if err != nil {
		return 0, fmt.Errorf("cannot convert simhashStr %q to uint64 simhash: %w", simhashStr, err)
	}
	another, err := strconv.ParseUint(anotherStr, Binary, BitsLength)
	if err != nil {
		return 0, fmt.Errorf("cannot convert anotherStr %q to uint64 simhash: %w", anotherStr, err)
	}
	return CalculateDistanceBySimhash(simhash, another), nil
}
// IsSimhashBinStringDuplicated parses two base-2 fingerprint strings (as
// produced by MakeSimhashBinString) and reports whether they differ in at
// most limit bit positions.
//
// If either string is not a valid binary number that fits in 64 bits, the
// result is false and the error wraps the underlying strconv error, naming the
// offending argument. Nothing is printed: reporting is left to the caller.
func IsSimhashBinStringDuplicated(simhashStr string, anotherStr string, limit int) (bool, error) {
	simhash, err := strconv.ParseUint(simhashStr, Binary, BitsLength)
	if err != nil {
		return false, fmt.Errorf("cannot convert simhashStr %q to uint64 simhash: %w", simhashStr, err)
	}
	another, err := strconv.ParseUint(anotherStr, Binary, BitsLength)
	if err != nil {
		return false, fmt.Errorf("cannot convert anotherStr %q to uint64 simhash: %w", anotherStr, err)
	}
	return IsSimhashDuplicated(simhash, another, limit), nil
}
// convertWordWeights2HashWeights fills hws[i] with the 64-bit hash and the
// weight of wws[i] for every extracted word. hws is written in place and must
// be at least as long as wws, otherwise indexing it panics.
func (s *Simhasher) convertWordWeights2HashWeights(wws []jieba.WordWeight, hws []HashWeight) {
	for i := range wws {
		hws[i] = HashWeight{
			hash:   s.hasher.Hash64(wws[i].Word),
			weight: wws[i].Weight,
		}
	}
}
// getDictPath replaces, in place, every empty path with the matching gojieba
// default: DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH and STOP_WORDS_PATH
// respectively. Non-empty paths are left untouched. All pointers must be
// non-nil.
func getDictPath(dict *string, hmm *string, userDict *string, idf *string, stopWords *string) {
	defaults := [...]struct {
		path     *string
		fallback string
	}{
		{dict, jieba.DICT_PATH},
		{hmm, jieba.HMM_PATH},
		{userDict, jieba.USER_DICT_PATH},
		{idf, jieba.IDF_PATH},
		{stopWords, jieba.STOP_WORDS_PATH},
	}
	for _, d := range defaults {
		if *d.path == "" {
			*d.path = d.fallback
		}
	}
}