From 0d816f5b8c1a3f209e12c5677dcc0fff29181039 Mon Sep 17 00:00:00 2001 From: Vinicius Mignot Date: Thu, 7 Jan 2021 12:43:59 -0300 Subject: [PATCH] fix(healthcheck) use single timer for all active checks (#62) * fix(healthcheck) use single timer for all active checks * tests(*) removed tests that are not needed --- .travis.yml | 3 +- lib/resty/healthcheck.lua | 158 +++++++++++++++------------ lib/resty/healthcheck/utils.lua | 47 -------- lua-resty-healthcheck-scm-1.rockspec | 6 +- t/01-start-stop.t | 47 -------- t/03-get_target_status.t | 5 +- t/04-report_success.t | 6 +- t/05-report_failure.t | 9 +- t/06-report_http_status.t | 18 +-- t/07-report_tcp_failure.t | 10 +- t/08-report_timeout.t | 10 +- 11 files changed, 98 insertions(+), 221 deletions(-) delete mode 100644 lib/resty/healthcheck/utils.lua diff --git a/.travis.yml b/.travis.yml index 37fa4f44..ece9256b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -48,8 +48,9 @@ install: - popd - export PATH=$OPENRESTY_PREFIX/nginx/sbin:$LUAROCKS_PREFIX/bin:$PATH - sudo luarocks install luacheck > build.log 2>&1 || (cat build.log && exit 1) - - sudo luarocks install lua-resty-worker-events > build.log 2>&1 || (cat build.log && exit 1) + - sudo luarocks install lua-resty-worker-events 1.0.0 > build.log 2>&1 || (cat build.log && exit 1) - sudo luarocks install penlight > build.log 2>&1 || (cat build.log && exit 1) + - sudo luarocks install lua-resty-timer > build.log 2>&1 || (cat build.log && exit 1) - luarocks --version - nginx -V diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index 9358cfe8..ae1baa9c 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -32,13 +32,13 @@ local tostring = tostring local ipairs = ipairs local cjson = require("cjson.safe").new() local table_remove = table.remove -local utils = require("resty.healthcheck.utils") local worker_events = require("resty.worker.events") local resty_lock = require ("resty.lock") local re_find = ngx.re.find local bit = require("bit") local ngx_now = ngx.now local ssl = require("ngx.ssl") +local resty_timer = require "resty.timer" -- constants local EVENT_SOURCE_PREFIX = "lua-resty-healthcheck" @@ -136,6 +136,12 @@ local function dump(...) print(require("pl.pretty").write({...})) end -- luachec local _M = {} +-- checker objects (weak) table +local hcs = setmetatable({}, { + __mode = "v", +}) + +local active_check_timer -- TODO: improve serialization speed -- serialize a table to a string @@ -969,60 +975,46 @@ end --- Active health check callback function. --- @param premature default openresty param -- @param self the checker object this timer runs on -- @param health_mode either "healthy" or "unhealthy" to indicate what check -local function checker_callback(premature, self, health_mode) - if premature or self.stopping then - self.timer_count = self.timer_count - 1 - return +local function checker_callback(self, health_mode) + local list_to_check = {} + local targets = fetch_target_list(self) + for _, target in ipairs(targets) do + local tgt = get_target(self, target.ip, target.port, target.hostname) + local internal_health = tgt and tgt.internal_health or nil + if (health_mode == "healthy" and (internal_health == "healthy" or + internal_health == "mostly_healthy")) + or (health_mode == "unhealthy" and (internal_health == "unhealthy" or + internal_health == "mostly_unhealthy")) + then + list_to_check[#list_to_check + 1] = { + ip = target.ip, + port = target.port, + hostname = target.hostname, + hostheader = target.hostheader, + debug_health = internal_health, + } + end end - local interval - if not self:get_periodic_lock(health_mode) then - -- another worker just ran, or is running the healthcheck - interval = self.checks.active[health_mode].interval - + if not list_to_check[1] then + self:log(DEBUG, "checking ", health_mode, " targets: nothing to do") else - -- we're elected to run the active healthchecks - -- create a list of targets to check, here we can still do this atomically - local start_time = ngx_now() - local list_to_check = {} - local targets = fetch_target_list(self) - for _, target in ipairs(targets) do - local tgt = get_target(self, target.ip, target.port, target.hostname) - local internal_health = tgt and tgt.internal_health or nil - if (health_mode == "healthy" and (internal_health == "healthy" or - internal_health == "mostly_healthy")) - or (health_mode == "unhealthy" and (internal_health == "unhealthy" or - internal_health == "mostly_unhealthy")) - then - list_to_check[#list_to_check + 1] = { - ip = target.ip, - port = target.port, - hostname = target.hostname, - hostheader = target.hostheader, - debug_health = internal_health, - } - end + local timer, err = resty_timer({ + interval = 0, + recurring = false, + immediate = false, + detached = true, + expire = function() + self:log(DEBUG, "checking ", health_mode, " targets: #", #list_to_check) + self:active_check_targets(list_to_check) + self.checks.active[health_mode].last_run = ngx_now() + end, + }) + if timer == nil then + self:log(ERR, "failed to create timer to check ", health_mode) end - - if not list_to_check[1] then - self:log(DEBUG, "checking ", health_mode, " targets: nothing to do") - else - self:log(DEBUG, "checking ", health_mode, " targets: #", #list_to_check) - self:active_check_targets(list_to_check) - end - - local run_time = ngx_now() - start_time - interval = math.max(0, self.checks.active[health_mode].interval - run_time) - end - - -- reschedule timer - local ok, err = utils.gctimer(interval, checker_callback, self, health_mode) - if not ok then - self.timer_count = self.timer_count - 1 - self:log(ERR, "failed to re-create '", health_mode, "' timer: ", err) end end @@ -1117,8 +1109,9 @@ end -- after the current timers have expired they will be marked as stopped. -- @return `true` function checker:stop() - self.stopping = true - self:log(DEBUG, "timers stopped") + self.checks.active.healthy.active = false + self.checks.active.unhealthy.active = false + self:log(DEBUG, "healthchecker stopped") return true end @@ -1126,32 +1119,21 @@ end --- Start the background health checks. -- @return `true`, or `nil + error`. function checker:start() - if self.timer_count ~= 0 then - return nil, "cannot start, " .. self.timer_count .. " (of 2) timers are still running" - end - - local ok, err if self.checks.active.healthy.interval > 0 then - ok, err = utils.gctimer(0, checker_callback, self, "healthy") - if not ok then - return nil, "failed to create 'healthy' timer: " .. err - end - self.timer_count = self.timer_count + 1 + self.checks.active.healthy.active = true + -- the first active check happens only after `interval` + self.checks.active.healthy.last_run = ngx_now() end if self.checks.active.unhealthy.interval > 0 then - ok, err = utils.gctimer(0, checker_callback, self, "unhealthy") - if not ok then - return nil, "failed to create 'unhealthy' timer: " .. err - end - self.timer_count = self.timer_count + 1 + self.checks.active.unhealthy.active = true + self.checks.active.unhealthy.last_run = ngx_now() end worker_events.unregister(self.ev_callback, self.EVENT_SOURCE) -- ensure we never double subscribe worker_events.register_weak(self.ev_callback, self.EVENT_SOURCE) - self.stopping = false -- do this at the end, so if either creation fails, the other stops also - self:log(DEBUG, "timers started") + self:log(DEBUG, "active check flagged as active") return true end @@ -1370,8 +1352,6 @@ function _M.new(opts) -- other properties self.targets = nil -- list of targets, initially loaded, maintained by events self.events = nil -- hash table with supported events (prevent magic strings) - self.stopping = true -- flag to indicate to timers to stop checking - self.timer_count = 0 -- number of running timers self.ev_callback = nil -- callback closure per checker instance -- Convert status lists to sets @@ -1436,13 +1416,49 @@ function _M.new(opts) worker_events:poll() end - -- start timers + -- turn on active health check local ok, err = self:start() if not ok then self:stop() return nil, err end + -- if active checker is needed and not running, start it + if (self.checks.active.healthy.active or self.checks.active.unhealthy.active) + and active_check_timer == nil then + + self:log(DEBUG, "starting active check timer") + local err + active_check_timer, err = resty_timer({ + recurring = true, + interval = 0.1, -- evaluate active checks every 0.1s + detached = false, + expire = function() + local cur_time = ngx_now() + for _, checker_obj in ipairs(hcs) do + if checker_obj.checks.active.healthy.active and + (checker_obj.checks.active.healthy.last_run + + checker_obj.checks.active.healthy.interval <= cur_time) + then + checker_callback(checker_obj, "healthy") + end + + if checker_obj.checks.active.unhealthy.active and + (checker_obj.checks.active.unhealthy.last_run + + checker_obj.checks.active.unhealthy.interval <= cur_time) + then + checker_callback(checker_obj, "unhealthy") + end + end + end, + }) + if not active_check_timer then + self:log(ERR, "Could not start active check timer: ", err) + end + end + + table.insert(hcs, self) + -- TODO: push entire config in debug level logs self:log(DEBUG, "Healthchecker started!") return self diff --git a/lib/resty/healthcheck/utils.lua b/lib/resty/healthcheck/utils.lua deleted file mode 100644 index b95731e7..00000000 --- a/lib/resty/healthcheck/utils.lua +++ /dev/null @@ -1,47 +0,0 @@ --------------------------------------------------------------------------- --- Utilities used by the healthcheck library. --- --- Code originally from https://github.com/Kong/lua-resty-dns-client --- @copyright 2017 Kong Inc. --- @author Hisham Muhammad, Thijs Schreijer --- @license Apache 2.0 - -local timer_at = ngx.timer.at - -local _M = {} - --------------------------------------------------------- --- GC'able timer implementation with 'self' --------------------------------------------------------- -local timer_registry = setmetatable({},{ __mode = "v" }) -local timer_id = 0 - -local timer_callback = function(premature, cb, id, ...) - local self = timer_registry[id] - if not self then return end -- GC'ed, nothing more to do - timer_registry[id] = nil - return cb(premature, self, ...) -end - ---- A garbage-collectible timer implementation. --- Provides a timer that can be attached to an object, and GC'ed along --- with that object, as opposed to regular timers that keep running and --- prevent the object from being GC'ed. --- @param t time in ms --- @param cb callback for the timer. The signature is `function(premature, self, ...)` --- @param self the object the timer belongs to --- @param ... any additional parameters to pass to the timer callback --- @return timer handle (same as `ngx.timer.at`) -_M.gctimer = function(t, cb, self, ...) - assert(type(cb) == "function", "expected arg #2 to be a function") - assert(type(self) == "table", "expected arg #3 to be a table") - timer_id = timer_id + 1 - timer_registry[timer_id] = self - -- if in the call below we'd be passing `self` instead of the scalar `timer_id`, it - -- would prevent the whole `self` object from being garbage collected because - -- it is anchored on the timer. - return timer_at(t, timer_callback, cb, timer_id, ...) -end - - -return _M diff --git a/lua-resty-healthcheck-scm-1.rockspec b/lua-resty-healthcheck-scm-1.rockspec index cca3e690..1922b1cc 100644 --- a/lua-resty-healthcheck-scm-1.rockspec +++ b/lua-resty-healthcheck-scm-1.rockspec @@ -14,13 +14,13 @@ description = { homepage = "https://github.com/Kong/lua-resty-healthcheck" } dependencies = { - "lua-resty-worker-events == 0.3.1", + "lua-resty-worker-events == 1.0.0", "penlight == 1.7.0", + "lua-resty-timer ~> 1", } build = { type = "builtin", modules = { - ["resty.healthcheck"] = "lib/resty/healthcheck.lua", - ["resty.healthcheck.utils"] = "lib/resty/healthcheck/utils.lua", + ["resty.healthcheck"] = "lib/resty/healthcheck.lua", } } diff --git a/t/01-start-stop.t b/t/01-start-stop.t index a3e189e3..f2aa4efd 100644 --- a/t/01-start-stop.t +++ b/t/01-start-stop.t @@ -43,49 +43,15 @@ __DATA__ ngx.sleep(0.2) -- wait twice the interval local ok, err = checker:start() ngx.say(ok) - ngx.say(checker.timer_count) } } --- request GET /t --- response_body true -2 --- no_error_log [error] -=== TEST 2: start() cannot start a second time using active health checks ---- http_config eval: $::HttpConfig ---- config - location = /t { - content_by_lua_block { - local we = require "resty.worker.events" - assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) - local healthcheck = require("resty.healthcheck") - local checker = healthcheck.new({ - name = "testing", - shm_name = "test_shm", - checks = { - active = { - healthy = { - interval = 0.1 - }, - unhealthy = { - interval = 0.1 - } - } - } - }) - local ok, err = checker:start() - ngx.say(err) - } - } ---- request -GET /t ---- response_body -cannot start, 2 (of 2) timers are still running ---- no_error_log -[error] === TEST 3: start() is a no-op if active intervals are 0 --- http_config eval: $::HttpConfig @@ -115,7 +81,6 @@ cannot start, 2 (of 2) timers are still running ngx.say(ok) local ok, err = checker:start() ngx.say(ok) - ngx.say(checker.timer_count) } } --- request @@ -124,7 +89,6 @@ GET /t true true true -0 --- no_error_log [error] @@ -152,17 +116,12 @@ true }) local ok, err = checker:stop() ngx.say(ok) - ngx.say(checker.timer_count) - ngx.sleep(0.2) -- wait twice the interval - ngx.say(checker.timer_count) } } --- request GET /t --- response_body true -2 -0 --- no_error_log [error] checking @@ -191,12 +150,9 @@ checking }) local ok, err = checker:stop() ngx.say(ok) - ngx.say(checker.timer_count) ngx.sleep(0.2) -- wait twice the interval - ngx.say(checker.timer_count) local ok, err = checker:start() ngx.say(ok) - ngx.say(checker.timer_count) ngx.sleep(0.2) -- wait twice the interval } } @@ -204,9 +160,6 @@ checking GET /t --- response_body true -2 -0 true -2 --- error_log checking diff --git a/t/03-get_target_status.t b/t/03-get_target_status.t index 8be6c4bd..d7e0b4d5 100644 --- a/t/03-get_target_status.t +++ b/t/03-get_target_status.t @@ -3,7 +3,7 @@ use Cwd qw(cwd); workers(1); -plan tests => repeat_each() * (blocks() * 4) + 2; +plan tests => repeat_each() * (blocks() * 4); my $pwd = cwd(); @@ -77,9 +77,6 @@ GET /t true false true ---- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do --- no_error_log checking healthy targets: #1 checking unhealthy targets: #1 diff --git a/t/04-report_success.t b/t/04-report_success.t index 91b36a10..8fb5fb6e 100644 --- a/t/04-report_success.t +++ b/t/04-report_success.t @@ -3,7 +3,7 @@ use Cwd qw(cwd); workers(1); -plan tests => repeat_each() * 32; +plan tests => repeat_each() * 28; my $pwd = cwd(); @@ -84,8 +84,6 @@ GET /t true true --- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do healthy SUCCESS increment (1/3) for '(127.0.0.1:2116)' healthy SUCCESS increment (2/3) for '(127.0.0.1:2116)' healthy SUCCESS increment (3/3) for '(127.0.0.1:2116)' @@ -161,8 +159,6 @@ GET /t true true --- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do healthy SUCCESS increment (1/3) for '(127.0.0.1:2116)' healthy SUCCESS increment (2/3) for '(127.0.0.1:2116)' healthy SUCCESS increment (3/3) for '(127.0.0.1:2116)' diff --git a/t/05-report_failure.t b/t/05-report_failure.t index 9b95c395..47f8c7e9 100644 --- a/t/05-report_failure.t +++ b/t/05-report_failure.t @@ -3,7 +3,7 @@ use Cwd qw(cwd); workers(1); -plan tests => repeat_each() * (blocks() * 11) - 1; +plan tests => repeat_each() * 26; my $pwd = cwd(); @@ -84,8 +84,6 @@ GET /t false false --- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do unhealthy HTTP increment (1/3) for '(127.0.0.1:2117)' unhealthy HTTP increment (2/3) for '(127.0.0.1:2117)' unhealthy HTTP increment (3/3) for '(127.0.0.1:2117)' @@ -161,8 +159,6 @@ GET /t false false --- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do unhealthy TCP increment (1/2) for '(127.0.0.1:2117)' unhealthy TCP increment (2/2) for '(127.0.0.1:2117)' event: target status '(127.0.0.1:2117)' from 'true' to 'false' @@ -235,9 +231,6 @@ GET /t --- response_body true true ---- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do --- no_error_log unhealthy TCP increment (1/2) for '(127.0.0.1:2117)' unhealthy TCP increment (2/2) for '(127.0.0.1:2117)' diff --git a/t/06-report_http_status.t b/t/06-report_http_status.t index ef19643b..0d2c1cdd 100644 --- a/t/06-report_http_status.t +++ b/t/06-report_http_status.t @@ -3,7 +3,7 @@ use Cwd qw(cwd); workers(1); -plan tests => repeat_each() * 53; +plan tests => repeat_each() * 41; my $pwd = cwd(); @@ -84,8 +84,6 @@ GET /t false false --- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do unhealthy HTTP increment (1/3) for '(127.0.0.1:2119)' unhealthy HTTP increment (2/3) for '(127.0.0.1:2119)' unhealthy HTTP increment (3/3) for '(127.0.0.1:2119)' @@ -164,8 +162,6 @@ GET /t true true --- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do healthy SUCCESS increment (1/4) for '(127.0.0.1:2119)' healthy SUCCESS increment (2/4) for '(127.0.0.1:2119)' healthy SUCCESS increment (3/4) for '(127.0.0.1:2119)' @@ -237,9 +233,6 @@ qq{ GET /t --- response_body false ---- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do --- no_error_log healthy SUCCESS increment event: target status '127.0.0.1 (127.0.0.1:2119)' from 'false' to 'true' @@ -304,9 +297,6 @@ qq{ GET /t --- response_body false ---- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do --- no_error_log healthy SUCCESS increment event: target status '127.0.0.1 (127.0.0.1:2119)' from 'false' to 'true' @@ -371,9 +361,6 @@ qq{ GET /t --- response_body true ---- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do --- no_error_log unhealthy HTTP increment event: target status '127.0.0.1 (127.0.0.1:2119)' from 'true' to 'false' @@ -438,9 +425,6 @@ qq{ GET /t --- response_body true ---- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do --- no_error_log unhealthy HTTP increment event: target status '(127.0.0.1:2119)' from 'true' to 'false' diff --git a/t/07-report_tcp_failure.t b/t/07-report_tcp_failure.t index f0d8b6b4..9e4e0e7c 100644 --- a/t/07-report_tcp_failure.t +++ b/t/07-report_tcp_failure.t @@ -3,7 +3,7 @@ use Cwd qw(cwd); workers(1); -plan tests => repeat_each() * 24; +plan tests => repeat_each() * 18; my $pwd = cwd(); @@ -84,8 +84,6 @@ GET /t false false --- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do unhealthy TCP increment (1/3) for '(127.0.0.1:2120)' unhealthy TCP increment (2/3) for '(127.0.0.1:2120)' unhealthy TCP increment (3/3) for '(127.0.0.1:2120)' @@ -154,9 +152,6 @@ qq{ GET /t --- response_body true ---- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do --- no_error_log unhealthy TCP increment event: target status '(127.0.0.1:2120)' from 'true' to 'false' @@ -221,9 +216,6 @@ qq{ GET /t --- response_body true ---- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do --- no_error_log unhealthy TCP increment event: target status '(127.0.0.1:2120)' from 'true' to 'false' diff --git a/t/08-report_timeout.t b/t/08-report_timeout.t index 7f4ef911..fb61ea18 100644 --- a/t/08-report_timeout.t +++ b/t/08-report_timeout.t @@ -3,7 +3,7 @@ use Cwd qw(cwd); workers(1); -plan tests => repeat_each() * 22; +plan tests => repeat_each() * 16; my $pwd = cwd(); @@ -84,8 +84,6 @@ GET /t false false --- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do unhealthy TIMEOUT increment (1/2) for '(127.0.0.1:2122)' unhealthy TIMEOUT increment (2/2) for '(127.0.0.1:2122)' event: target status '(127.0.0.1:2122)' from 'true' to 'false' @@ -154,9 +152,6 @@ qq{ GET /t --- response_body true ---- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do --- no_error_log unhealthy TCP increment event: target status '(127.0.0.1:2122)' from 'true' to 'false' @@ -223,9 +218,6 @@ qq{ GET /t --- response_body true ---- error_log -checking healthy targets: nothing to do -checking unhealthy targets: nothing to do --- no_error_log unhealthy TCP increment event: target status '(127.0.0.1:2122)' from 'true' to 'false'