Skip to content

Commit

Permalink
fix(healthcheck) lower the cleanup check frequency
Browse files Browse the repository at this point in the history
the health-check timer also checks if targets must be removed. to
safely remove targets, the targets list is locked. if this check runs
on every health-check cycle and there are a large number of targets, a
bazillion locks will be created. this change avoids that by lowering
the frequency the cleanup list is checked.

the side-effect is that targets marked for cleanup may exist for more
time (2.5s) than expected, and some unexpected active checks could
happen.
  • Loading branch information
locao committed Jul 7, 2022
1 parent 47f8709 commit e434007
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 25 deletions.
55 changes: 32 additions & 23 deletions lib/resty/healthcheck.lua
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ local CHECK_JITTER = CHECK_INTERVAL * 0.1
-- the check interval. If it does not update the shm during this period, we
-- consider that it is not able to continue checking (the worker probably was killed)
local LOCK_PERIOD = CHECK_INTERVAL * 15
-- interval between stale targets cleanup
local CLEANUP_INTERVAL = CHECK_INTERVAL * 25

-- Counters: a 32-bit shm integer can hold up to four 8-bit counters.
local CTR_SUCCESS = 0x00000001
Expand Down Expand Up @@ -200,6 +202,7 @@ local hcs = setmetatable({}, {
})

local active_check_timer
local last_cleanup_check

-- TODO: improve serialization speed
-- serialize a table to a string
Expand Down Expand Up @@ -1630,6 +1633,7 @@ function _M.new(opts)
self:log(DEBUG, "worker ", ngx_worker_id(), " (pid: ", ngx_worker_pid(), ") ",
"starting active check timer")
local shm, key = self.shm, self.PERIODIC_LOCK
last_cleanup_check = ngx_now()
active_check_timer, err = resty_timer({
recurring = true,
interval = CHECK_INTERVAL,
Expand All @@ -1647,34 +1651,39 @@ function _M.new(opts)

local cur_time = ngx_now()
for _, checker_obj in pairs(hcs) do
-- clear targets marked for delayed removal
locking_target_list(checker_obj, function(target_list)
local removed_targets = {}
local index = 1
while index <= #target_list do
local target = target_list[index]
if target.purge_time and target.purge_time <= cur_time then
table_insert(removed_targets, target)
table_remove(target_list, index)
else
index = index + 1

if (last_cleanup_check + CLEANUP_INTERVAL) < cur_time then
-- clear targets marked for delayed removal
locking_target_list(checker_obj, function(target_list)
local removed_targets = {}
local index = 1
while index <= #target_list do
local target = target_list[index]
if target.purge_time and target.purge_time <= cur_time then
table_insert(removed_targets, target)
table_remove(target_list, index)
else
index = index + 1
end
end
end

if #removed_targets > 0 then
target_list = serialize(target_list)
if #removed_targets > 0 then
target_list = serialize(target_list)

local ok, err = shm:set(checker_obj.TARGET_LIST, target_list)
if not ok then
return nil, "failed to store target_list in shm: " .. err
end
local ok, err = shm:set(checker_obj.TARGET_LIST, target_list)
if not ok then
return nil, "failed to store target_list in shm: " .. err
end

for _, target in ipairs(removed_targets) do
clear_target_data_from_shm(checker_obj, target.ip, target.port, target.hostname)
checker_obj:raise_event(checker_obj.events.remove, target.ip, target.port, target.hostname)
for _, target in ipairs(removed_targets) do
clear_target_data_from_shm(checker_obj, target.ip, target.port, target.hostname)
checker_obj:raise_event(checker_obj.events.remove, target.ip, target.port, target.hostname)
end
end
end
end)
end)

last_cleanup_check = cur_time
end

if checker_obj.checks.active.healthy.active and
(checker_obj.checks.active.healthy.last_run +
Expand Down
4 changes: 2 additions & 2 deletions t/11-clear.t
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ unhealthy HTTP increment (3/3) for '(127.0.0.1:21120)'

local checker2 = healthcheck.new(config)
ngx.say(checker2:get_target_status("127.0.0.1", 10001))
ngx.sleep(0.4) -- wait while the targets are cleared
ngx.sleep(2.6) -- wait while the targets are cleared
local status, err = checker2:get_target_status("127.0.0.1", 10001)
if status ~= nil then
ngx.say(status)
Expand Down Expand Up @@ -251,7 +251,7 @@ target not found
local checker2 = healthcheck.new(config)
checker2:add_target("127.0.0.1", 10002, nil, true)
ngx.say(checker2:get_target_status("127.0.0.1", 10002))
ngx.sleep(0.4) -- wait while the targets would be cleared
ngx.sleep(2.6) -- wait while the targets would be cleared
local status, err = checker2:get_target_status("127.0.0.1", 10001)
if status ~= nil then
ngx.say(status)
Expand Down

0 comments on commit e434007

Please sign in to comment.