-
Notifications
You must be signed in to change notification settings - Fork 0
/
countmin.lua
166 lines (136 loc) · 4.21 KB
/
countmin.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
local math = require("math")
local xxhash = require("xxhash")
local hash = xxhash.xxh32
math.randomseed(os.time())
local new = function(epsilon, delta)
assert(type(epsilon) == "number", "epsilon must be a number")
assert(epsilon > 0, "epsilon must be bigger than zero")
assert(type(delta) == "number", "delta must be a number")
assert(delta> 0, "delta must be bigger than zero")
-- Count Min Sketch variables
local w = math.ceil(2.718281828459045 / epsilon)
local d = math.ceil(math.log(1 / delta))
local l = d * w
-- main datastructure
local array = {}
-- hash variables
local seed1
local seed2
-- counter
local addedItems
local maxval = 2^53 - 1
-- temporary index table
local temp_index = {}
for x = 1, d do
temp_index[x] = 0
end
local query = function(input, update)
assert(type(input) == "string", 'input must be a string')
assert(type(update) == "boolean", 'update must be a boolean')
-- make two different hashvalues from the input with different seeds
local h1 = hash(input, seed1)
local h2 = hash(input, seed2)
-- track whehter or not we have added a new value, 0 if added 1 if not added
local existed = 1
-- create hashes and compute array index and bit index for them to get
-- the individual bits
local min = maxval
for i = 1, d do
-- set offset, we are using one big array instead of many smaller ones
local offset = (i - 1) * w
-- use enhanced double hashing (Kirsh & Mitzenmacher) to crete the hash
-- value used for the array index. +1 to account for Lua arrays
local x = ((h1 + (i * h2) + i^2) % w) + 1
temp_index[i] = x
local num = array[x + offset]
-- if the bit is zero we know did not exist already just return if we are
-- not updating it
if num == 0 then
existed = 0
if not update then
return 0
end
end
-- update the smallest seen value
if num < min then
min = num
end
end
-- update the item counter if we added a new item
if update then
-- conservative update, only update values that smaller than min + 1
for i = 1, d do
local offset = (i - 1) * w
local index = temp_index[i] + offset
if array[index] < (min + 1) and min < maxval then
array[index] = array[index] + 1
end
end
-- update min in order to return the updated count
min = min + 1
-- update the items counter if we addded a new item
if existed == 0 then
addedItems = addedItems + 1
end
end
return min
end
--[[
Adds a new entry to the set. returns 1 if the element
already existed and 0 if it does not exist in the set.
]]
local add = function(input)
return query(input, true)
end
--[[
checks whether or not an element is in the set.
returns 1 if the element exists and 0 if it does
not exist in the set.
]]
local check = function(input)
return query(input, false)
end
--[[ resets the arrays used in the filter ]]
local reset = function()
addedItems = 0
-- since 2*rand and 3*rand is done modulo a prime these number will always
-- be different and should therefore safe to use as seeds to generate two
-- different hashes for the same input.
local rand = math.random(-2147483648, 2147483647)
seed1 = (2*rand) % 2147483647
seed2 = (3*rand) % 2147483647
for x = 1, l do
array[x] = 0
end
end
-- [[ get number of unique added items ]]
local getNumItems = function()
return addedItems
end
-- [[ get number of bits this instance uses ]]
local getDepth = function()
return d
end
-- [[ get number of keys this instance uses ]]
local getWidth = function()
return w
end
-- [[ get number of keys this instance uses ]]
local getAccumulatedError = function()
return addedItems * epsilon
end
-- reset the array to initialize this instance
reset()
-- methods we expose for this instance
return {
add = add,
check = check,
reset = reset,
getNumItems = getNumItems,
getDepth = getDepth,
getWidth = getWidth,
getAccumulatedError = getAccumulatedError
}
end
-- methods we expose for this module
return { new = new }