forked from philpearl/intern
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathintern.go
177 lines (156 loc) · 4.69 KB
/
intern.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
// Package intern is a string interning library. Pass it a string, and it will store it and return it, removing duplicates. That is, however many times you show it a string, it will only store that string once, and will always
// return a version of it backed by the same memory.
//
// Storage is kind to GC. It is optimised for storing a very large number of strings.
package intern
import (
"math/bits"
"github.com/philpearl/aeshash"
"github.com/ou05020/stringbank"
)
// Intern implements the interner. Allocate it
type Intern struct {
stringbank.Stringbank
table table
oldTable table
count int
oldTableCursor int
}
// New creates a new interning table
func New(cap int) *Intern {
if cap < 16 {
cap = 16
} else {
cap = 1 << uint(64-bits.LeadingZeros(uint(cap-1)))
}
return &Intern{
table: table{
hashes: make([]uint32, cap),
indices: make([]int, cap),
},
}
}
// Len returns the number of unique strings stored
func (i *Intern) Len() int {
return i.count
}
// Cap returns the size of the intern table
func (i *Intern) Cap() int {
return i.table.len()
}
// Deduplicate takes a string and returns a permanently stored version. This will always
// be backed by the same memory for the same string.
func (i *Intern) Deduplicate(val string) string {
return i.Get(i.Save(val))
}
// Save stores a string in out deduplicated string store, and returns an integer offset
// for accessing it.
func (i *Intern) Save(val string) int {
// we use a hashtable where the keys are stringbank offsets, but comparisons are done on
// strings. There is no value to store
i.resize()
hash := aeshash.Hash(val)
if i.oldTable.len() != 0 {
_, index := i.findInTable(i.oldTable, val, hash)
if index != 0 {
return index - 1
}
}
cursor, index := i.findInTable(i.table, val, hash)
if index != 0 {
return index - 1
}
// String was not found, so we want to store it. Cursor is the index where we should
// store it
offset := i.Stringbank.Save(val)
i.table.hashes[cursor] = hash
i.table.indices[cursor] = offset + 1
i.count++
return offset
}
// findInTable find the string val in the hash table. If the string is present, it returns the
// place in the table where it was found, plus the stringbank offset of the string + 1
func (i *Intern) findInTable(table table, val string, hashVal uint32) (cursor int, index int) {
l := table.len()
cursor = int(hashVal) & (l - 1)
start := cursor
for table.indices[cursor] != 0 {
if table.hashes[cursor] == hashVal {
if index := int(table.indices[cursor]); i.Get(index-1) == val {
return cursor, index
}
}
cursor++
if cursor == l {
cursor = 0
}
if cursor == start {
panic("out of space!")
}
}
return cursor, 0
}
func (i *Intern) copyEntryToTable(table table, index int, hash uint32) {
l := table.len()
cursor := int(hash) & (l - 1)
start := cursor
for table.indices[cursor] != 0 {
// the entry we're copying in is guaranteed not to be already
// present, so we're just looking for an empty space
cursor++
if cursor == l {
cursor = 0
}
if cursor == start {
panic("out of space (resize)!")
}
}
table.indices[cursor] = index
table.hashes[cursor] = hash
}
func (i *Intern) resize() {
if i.table.hashes == nil {
i.table.hashes = make([]uint32, 16)
i.table.indices = make([]int, 16)
}
if i.count < i.table.len()*3/4 && i.oldTable.len() == 0 {
return
}
if i.oldTable.hashes == nil {
i.oldTable, i.table = i.table, table{
hashes: make([]uint32, len(i.table.hashes)*2),
indices: make([]int, len(i.table.indices)*2),
}
}
// We copy items between tables 16 at a time. Since we do this every time
// anyone writes to the table we won't run out of space in the new table
// before this is complete
l := i.oldTable.len()
for k := 0; k < 16; k++ {
if index := i.oldTable.indices[k+i.oldTableCursor]; index != 0 {
i.copyEntryToTable(i.table, index, i.oldTable.hashes[k+i.oldTableCursor])
// The entry can exist in the old and new versions of the table without
// problems. If we did try to delete from the old table we'd have issues
// searching forward from clashing entries.
}
}
i.oldTableCursor += 16
if i.oldTableCursor >= l {
i.oldTable.hashes = nil
i.oldTable.indices = nil
i.oldTableCursor = 0
}
}
// table represents a hash table. We keep the indices and hashes separate in
// case we want to use different size types in the future
type table struct {
// We keep hashes in the table to speed up resizing, and also stepping through
// entries that have different hashes but hit the same bucket
hashes []uint32
// index is the index of the string in the stringbank, plus 1 so that valid
// entries are never zero
indices []int
}
func (t table) len() int {
return len(t.hashes)
}