From e434007e8fe40e3894758907faa9d31db4227831 Mon Sep 17 00:00:00 2001 From: Vinicius Mignot Date: Wed, 6 Jul 2022 16:45:26 -0300 Subject: [PATCH] fix(healthcheck) lower the cleanup check frequency the health-check timer also checks if targets must be removed. to safely remove targets, the targets list is locked. if this check runs on every health-check cycle and there are a large number of targets, a bazillion locks will be created. this change avoids that by lowering the frequency the cleanup list is checked. the side-effect is that targets marked for cleanup may exist for more time (2.5s) than expected, and some unexpected active checks could happen. --- lib/resty/healthcheck.lua | 55 +++++++++++++++++++++++---------------- t/11-clear.t | 4 +-- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index 2df9a245..3922d582 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -108,6 +108,8 @@ local CHECK_JITTER = CHECK_INTERVAL * 0.1 -- the check interval. If it does not update the shm during this period, we -- consider that it is not able to continue checking (the worker probably was killed) local LOCK_PERIOD = CHECK_INTERVAL * 15 +-- interval between stale targets cleanup +local CLEANUP_INTERVAL = CHECK_INTERVAL * 25 -- Counters: a 32-bit shm integer can hold up to four 8-bit counters. local CTR_SUCCESS = 0x00000001 @@ -200,6 +202,7 @@ local hcs = setmetatable({}, { }) local active_check_timer +local last_cleanup_check -- TODO: improve serialization speed -- serialize a table to a string @@ -1630,6 +1633,7 @@ function _M.new(opts) self:log(DEBUG, "worker ", ngx_worker_id(), " (pid: ", ngx_worker_pid(), ") ", "starting active check timer") local shm, key = self.shm, self.PERIODIC_LOCK + last_cleanup_check = ngx_now() active_check_timer, err = resty_timer({ recurring = true, interval = CHECK_INTERVAL, @@ -1647,34 +1651,39 @@ function _M.new(opts) local cur_time = ngx_now() for _, checker_obj in pairs(hcs) do - -- clear targets marked for delayed removal - locking_target_list(checker_obj, function(target_list) - local removed_targets = {} - local index = 1 - while index <= #target_list do - local target = target_list[index] - if target.purge_time and target.purge_time <= cur_time then - table_insert(removed_targets, target) - table_remove(target_list, index) - else - index = index + 1 + + if (last_cleanup_check + CLEANUP_INTERVAL) < cur_time then + -- clear targets marked for delayed removal + locking_target_list(checker_obj, function(target_list) + local removed_targets = {} + local index = 1 + while index <= #target_list do + local target = target_list[index] + if target.purge_time and target.purge_time <= cur_time then + table_insert(removed_targets, target) + table_remove(target_list, index) + else + index = index + 1 + end end - end - if #removed_targets > 0 then - target_list = serialize(target_list) + if #removed_targets > 0 then + target_list = serialize(target_list) - local ok, err = shm:set(checker_obj.TARGET_LIST, target_list) - if not ok then - return nil, "failed to store target_list in shm: " .. err - end + local ok, err = shm:set(checker_obj.TARGET_LIST, target_list) + if not ok then + return nil, "failed to store target_list in shm: " .. err + end - for _, target in ipairs(removed_targets) do - clear_target_data_from_shm(checker_obj, target.ip, target.port, target.hostname) - checker_obj:raise_event(checker_obj.events.remove, target.ip, target.port, target.hostname) + for _, target in ipairs(removed_targets) do + clear_target_data_from_shm(checker_obj, target.ip, target.port, target.hostname) + checker_obj:raise_event(checker_obj.events.remove, target.ip, target.port, target.hostname) + end end - end - end) + end) + + last_cleanup_check = cur_time + end if checker_obj.checks.active.healthy.active and (checker_obj.checks.active.healthy.last_run + diff --git a/t/11-clear.t b/t/11-clear.t index b25822c3..0ddb02d5 100644 --- a/t/11-clear.t +++ b/t/11-clear.t @@ -202,7 +202,7 @@ unhealthy HTTP increment (3/3) for '(127.0.0.1:21120)' local checker2 = healthcheck.new(config) ngx.say(checker2:get_target_status("127.0.0.1", 10001)) - ngx.sleep(0.4) -- wait while the targets are cleared + ngx.sleep(2.6) -- wait while the targets are cleared local status, err = checker2:get_target_status("127.0.0.1", 10001) if status ~= nil then ngx.say(status) @@ -251,7 +251,7 @@ target not found local checker2 = healthcheck.new(config) checker2:add_target("127.0.0.1", 10002, nil, true) ngx.say(checker2:get_target_status("127.0.0.1", 10002)) - ngx.sleep(0.4) -- wait while the targets would be cleared + ngx.sleep(2.6) -- wait while the targets would be cleared local status, err = checker2:get_target_status("127.0.0.1", 10001) if status ~= nil then ngx.say(status)