Skip to content

Commit

Permalink
feat(balancer) keep target health state on config updates (#8394)
Browse files Browse the repository at this point in the history
* chore(rockspec) bump lua-resty-healthcheck version

* feat(balancer) keep target health status

this commit uses the new delayed_clear() fn from lua-resty-healthcheck
to keep the target health status between config changes.

* docs(CHANGELOG) feature description

* tests(balancer) increase test probability to pass

Frequently the least-connections algorithm balances the traffic evenly
between targets. This change tries to make the test less flaky.
  • Loading branch information
locao authored Feb 14, 2022
1 parent 716d8d8 commit 4ceda40
Show file tree
Hide file tree
Showing 13 changed files with 201 additions and 43 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@
- Routes now support matching headers with regular expressions
Thanks, [@vanhtuan0409](https://github.com/vanhtuan0409)!
[#6079](https://github.com/Kong/kong/pull/6079)
- Targets keep their health status when upstreams are updated.
[#8394](https://github.com/Kong/kong/pull/8394)

#### Performance

Expand Down
2 changes: 1 addition & 1 deletion kong-2.7.0-0.rockspec
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ dependencies = {
"luaxxhash >= 1.0",
"lua-protobuf == 0.3.3",
"lua-resty-worker-events == 1.0.0",
"lua-resty-healthcheck == 1.4.2",
"lua-resty-healthcheck == 1.5.0",
"lua-resty-mlcache == 2.5.0",
"lua-messagepack == 0.5.2",
"lua-resty-openssl == 0.8.5",
Expand Down
2 changes: 2 additions & 0 deletions kong/constants.lua
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,8 @@ local constants = {
CLUSTERING_TIMEOUT = 5000, -- 5 seconds
CLUSTERING_PING_INTERVAL = 30, -- 30 seconds
CLUSTERING_OCSP_TIMEOUT = 5000, -- 5 seconds

CLEAR_HEALTH_STATUS_DELAY = 300, -- 300 seconds
}

for _, v in ipairs(constants.CLUSTERING_SYNC_STATUS) do
Expand Down
15 changes: 10 additions & 5 deletions kong/runloop/balancer/balancers.lua
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ local upstreams = require "kong.runloop.balancer.upstreams"
local targets
local healthcheckers
local dns_utils = require "kong.resty.dns.utils"
local constants = require "kong.constants"

local ngx = ngx
local log = ngx.log
Expand All @@ -25,6 +26,7 @@ local DEBUG = ngx.DEBUG
local TTL_0_RETRY = 60 -- Maximum life-time for hosts added with ttl=0, requery after it expires
local REQUERY_INTERVAL = 30 -- Interval for requerying failed dns queries
local SRV_0_WEIGHT = 1 -- SRV record with weight 0 should be hit minimally, hence we replace by 1
local CLEAR_HEALTH_STATUS_DELAY = constants.CLEAR_HEALTH_STATUS_DELAY


local balancers_M = {}
Expand Down Expand Up @@ -90,6 +92,7 @@ local function wait(id)
return nil, "timeout"
end


------------------------------------------------------------------------------
-- The mutually-exclusive section used internally by the
-- 'create_balancer' operation.
Expand Down Expand Up @@ -176,7 +179,7 @@ function balancers_M.create_balancer(upstream, recreate)
local existing_balancer = balancers_by_id[upstream.id]
if existing_balancer then
if recreate then
healthcheckers.stop_healthchecker(existing_balancer)
healthcheckers.stop_healthchecker(existing_balancer, CLEAR_HEALTH_STATUS_DELAY)
else
return existing_balancer
end
Expand Down Expand Up @@ -431,10 +434,12 @@ function balancer_mt:deleteDisabledAddresses(target)
local addr = addresses[i]

if addr.disabled then
self:callback("removed", addr, addr.ip, addr.port,
target.name, addr.hostHeader)
dirty = true
table_remove(addresses, i)
if type(self.callback) == "function" then
self:callback("removed", addr, addr.ip, addr.port,
target.name, addr.hostHeader)
end
dirty = true
table_remove(addresses, i)
end
end

Expand Down
71 changes: 53 additions & 18 deletions kong/runloop/balancer/healthcheckers.lua
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,16 @@ function healthcheckers_M.init()
end


function healthcheckers_M.stop_healthchecker(balancer)
function healthcheckers_M.stop_healthchecker(balancer, delay)
local healthchecker = balancer.healthchecker
if healthchecker then
local ok, err = healthchecker:clear()
local ok, err
if delay and delay > 0 then
ok, err = healthchecker:delayed_clear(delay)
else
ok, err = healthchecker:clear()
end

if not ok then
log(ERR, "[healthchecks] error clearing healthcheck data: ", err)
end
Expand Down Expand Up @@ -124,7 +130,6 @@ end

-- @param hc The healthchecker object
-- @param balancer The balancer object
-- @param upstream_id The upstream id
local function attach_healthchecker_to_balancer(hc, balancer)
local function hc_callback(tgt, event)
local status
Expand Down Expand Up @@ -189,6 +194,24 @@ local function attach_healthchecker_to_balancer(hc, balancer)
end


-- add empty healthcheck functions to balancer when hc is not used
local function populate_balancer(balancer)
balancer.report_http_status = function()
return true
end

balancer.report_tcp_failure = function()
return true
end

balancer.report_timeout = function()
return true
end

return true
end


local parsed_cert, parsed_key
local function parse_global_cert_and_key()
if not parsed_cert then
Expand All @@ -199,6 +222,21 @@ local function parse_global_cert_and_key()

return parsed_cert, parsed_key
end


local function is_upstream_using_healthcheck(upstream)
if upstream ~= nil then
return upstream.healthchecks.active.healthy.interval ~= 0
or upstream.healthchecks.active.unhealthy.interval ~= 0
or upstream.healthchecks.passive.unhealthy.tcp_failures ~= 0
or upstream.healthchecks.passive.unhealthy.timeouts ~= 0
or upstream.healthchecks.passive.unhealthy.http_failures ~= 0
end

return false
end


----------------------------------------------------------------------------
-- Create a healthchecker object.
-- @param upstream An upstream entity table.
Expand All @@ -213,6 +251,10 @@ function healthcheckers_M.create_healthchecker(balancer, upstream)
checks.active.unhealthy.interval = 0
end

if not is_upstream_using_healthcheck(upstream) then
return populate_balancer(balancer)
end

local ssl_cert, ssl_key
if upstream.client_certificate then
local cert, err = get_certificate(upstream.client_certificate)
Expand Down Expand Up @@ -251,19 +293,6 @@ function healthcheckers_M.create_healthchecker(balancer, upstream)
end


local function is_upstream_using_healthcheck(upstream)
if upstream ~= nil then
return upstream.healthchecks.active.healthy.interval ~= 0
or upstream.healthchecks.active.unhealthy.interval ~= 0
or upstream.healthchecks.passive.unhealthy.tcp_failures ~= 0
or upstream.healthchecks.passive.unhealthy.timeouts ~= 0
or upstream.healthchecks.passive.unhealthy.http_failures ~= 0
end

return false
end


--------------------------------------------------------------------------------
-- Get healthcheck information for an upstream.
-- @param upstream_id the id of the upstream.
Expand Down Expand Up @@ -378,11 +407,17 @@ function healthcheckers_M.unsubscribe_from_healthcheck_events(callback)
end


function healthcheckers_M.stop_healthcheckers()
--------------------------------------------------------------------------------
-- Stop all health checkers.
-- @param delay Delay before actually removing the health checker from memory.
-- When a upstream with the same targets might be created right after stopping
-- the health checker, this parameter is useful to avoid throwing away current
-- health status.
function healthcheckers_M.stop_healthcheckers(delay)
for _, id in pairs(upstreams.get_all_upstreams()) do
local balancer = balancers.get_balancer_by_id(id)
if balancer then
healthcheckers_M.stop_healthchecker(balancer)
healthcheckers_M.stop_healthchecker(balancer, delay)
end

balancers.set_balancer(id, nil)
Expand Down
4 changes: 3 additions & 1 deletion kong/runloop/balancer/upstreams.lua
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
---
local singletons = require "kong.singletons"
local workspaces = require "kong.workspaces"
local constants = require "kong.constants"
local balancers
local healthcheckers

Expand All @@ -22,6 +23,7 @@ local CRIT = ngx.CRIT
local ERR = ngx.ERR

local GLOBAL_QUERY_OPTS = { workspace = null, show_ws_id = true }
local CLEAR_HEALTH_STATUS_DELAY = constants.CLEAR_HEALTH_STATUS_DELAY


local upstreams_M = {}
Expand Down Expand Up @@ -196,7 +198,7 @@ local function do_upstream_event(operation, upstream_data)

local balancer = balancers.get_balancer_by_id(upstream_id)
if balancer then
healthcheckers.stop_healthchecker(balancer)
healthcheckers.stop_healthchecker(balancer, CLEAR_HEALTH_STATUS_DELAY)
end

if operation == "delete" then
Expand Down
3 changes: 2 additions & 1 deletion kong/runloop/handler.lua
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ local HOST_PORTS = {}


local SUBSYSTEMS = constants.PROTOCOLS_WITH_SUBSYSTEM
local CLEAR_HEALTH_STATUS_DELAY = constants.CLEAR_HEALTH_STATUS_DELAY
local TTL_ZERO = { ttl = 0 }


Expand Down Expand Up @@ -366,7 +367,7 @@ local function register_events()
end

local ok, err = concurrency.with_coroutine_mutex(FLIP_CONFIG_OPTS, function()
balancer.stop_healthcheckers()
balancer.stop_healthcheckers(CLEAR_HEALTH_STATUS_DELAY)

kong.cache:flip()
core_cache:flip()
Expand Down
10 changes: 5 additions & 5 deletions spec/01-unit/09-balancer/03-consistent_hashing_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -822,7 +822,7 @@ describe("[consistent_hashing]", function()
end)
it("weight change for unresolved record, updates properly", function()
local record = dnsA({
{ name = "really.really.really.does.not.exist.thijsschreijer.nl", address = "1.2.3.4" },
{ name = "really.really.really.does.not.exist.host.test", address = "1.2.3.4" },
})
dnsAAAA({
{ name = "getkong.org", address = "::1" },
Expand All @@ -832,7 +832,7 @@ describe("[consistent_hashing]", function()
wheelSize = 1000,
requery = 1,
})
add_target(b, "really.really.really.does.not.exist.thijsschreijer.nl", 80, 10)
add_target(b, "really.really.really.does.not.exist.host.test", 80, 10)
add_target(b, "getkong.org", 80, 10)
local count = count_indices(b)
assert.same({
Expand All @@ -844,7 +844,7 @@ describe("[consistent_hashing]", function()
record.expire = 0
record.expired = true
-- do a lookup to trigger the async lookup
client.resolve("really.really.really.does.not.exist.thijsschreijer.nl", {qtype = client.TYPE_A})
client.resolve("really.really.really.does.not.exist.host.test", {qtype = client.TYPE_A})
sleep(1) -- provide time for async lookup to complete

--b:_hit_all() -- hit them all to force renewal
Expand All @@ -857,10 +857,10 @@ describe("[consistent_hashing]", function()
}, count)

-- update the failed record
add_target(b, "really.really.really.does.not.exist.thijsschreijer.nl", 80, 20)
add_target(b, "really.really.really.does.not.exist.host.test", 80, 20)
-- reinsert a cache entry
dnsA({
{ name = "really.really.really.does.not.exist.thijsschreijer.nl", address = "1.2.3.4" },
{ name = "really.really.really.does.not.exist.host.test", address = "1.2.3.4" },
})
--sleep(2) -- wait for timer to re-resolve the record
targets.resolve_targets(b.targets)
Expand Down
16 changes: 8 additions & 8 deletions spec/01-unit/09-balancer/04-round_robin_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,7 @@ describe("[round robin balancer]", function()
dns = client,
wheelSize = 15,
})
assert(add_target(b, "really.really.really.does.not.exist.thijsschreijer.nl", 80, 10))
assert(add_target(b, "really.really.really.does.not.exist.hostname.test", 80, 10))
check_balancer(b)
assert.equals(0, b.totalWeight) -- has one failed host, so weight must be 0
dnsA({
Expand Down Expand Up @@ -1045,7 +1045,7 @@ describe("[round robin balancer]", function()
end)
it("weight change for unresolved record, updates properly", function()
local record = dnsA({
{ name = "really.really.really.does.not.exist.thijsschreijer.nl", address = "1.2.3.4" },
{ name = "really.really.really.does.not.exist.hostname.test", address = "1.2.3.4" },
})
dnsAAAA({
{ name = "getkong.test", address = "::1" },
Expand All @@ -1055,7 +1055,7 @@ describe("[round robin balancer]", function()
wheelSize = 60,
requery = 0.1,
})
add_target(b, "really.really.really.does.not.exist.thijsschreijer.nl", 80, 10)
add_target(b, "really.really.really.does.not.exist.hostname.test", 80, 10)
add_target(b, "getkong.test", 80, 10)
local count = count_indices(b)
assert.same({
Expand All @@ -1067,7 +1067,7 @@ describe("[round robin balancer]", function()
record.expire = 0
record.expired = true
-- do a lookup to trigger the async lookup
client.resolve("really.really.really.does.not.exist.thijsschreijer.nl", {qtype = client.TYPE_A})
client.resolve("really.really.really.does.not.exist.hostname.test", {qtype = client.TYPE_A})
sleep(0.5) -- provide time for async lookup to complete

for _ = 1, b.wheelSize do b:getPeer() end -- hit them all to force renewal
Expand All @@ -1079,10 +1079,10 @@ describe("[round robin balancer]", function()
}, count)

-- update the failed record
add_target(b, "really.really.really.does.not.exist.thijsschreijer.nl", 80, 20)
add_target(b, "really.really.really.does.not.exist.hostname.test", 80, 20)
-- reinsert a cache entry
dnsA({
{ name = "really.really.really.does.not.exist.thijsschreijer.nl", address = "1.2.3.4" },
{ name = "really.really.really.does.not.exist.hostname.test", address = "1.2.3.4" },
})
sleep(2) -- wait for timer to re-resolve the record
targets.resolve_targets(b.targets)
Expand Down Expand Up @@ -1303,9 +1303,9 @@ describe("[round robin balancer]", function()
end)
it("renewed DNS A record; last host fails DNS resolution #slow", function()
-- This test might show some error output similar to the lines below. This is expected and ok.
-- 2017/11/06 15:52:49 [warn] 5123#0: *2 [lua] balancer.lua:320: queryDns(): [ringbalancer] querying dns for really.really.really.does.not.exist.thijsschreijer.nl failed: dns server error: 3 name error, context: ngx.timer
-- 2017/11/06 15:52:49 [warn] 5123#0: *2 [lua] balancer.lua:320: queryDns(): [ringbalancer] querying dns for really.really.really.does.not.exist.hostname.test failed: dns server error: 3 name error, context: ngx.timer

local test_name = "really.really.really.does.not.exist.thijsschreijer.nl"
local test_name = "really.really.really.does.not.exist.hostname.test"
local ttl = 0.1
local staleTtl = 0 -- stale ttl = 0, force lookup upon expiring
local record = dnsA({
Expand Down
4 changes: 2 additions & 2 deletions spec/01-unit/09-balancer_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -184,11 +184,11 @@ for _, consistency in ipairs({"strict", "eventual"}) do
passive_hc.passive.unhealthy.http_failures = 1

UPSTREAMS_FIXTURES = {
[1] = { id = "a", ws_id = ws_id, name = "mashape", slots = 10, healthchecks = hc_defaults, algorithm = "round-robin" },
[1] = { id = "a", ws_id = ws_id, name = "mashape", slots = 10, healthchecks = passive_hc, algorithm = "round-robin" },
[2] = { id = "b", ws_id = ws_id, name = "kong", slots = 10, healthchecks = hc_defaults, algorithm = "round-robin" },
[3] = { id = "c", ws_id = ws_id, name = "gelato", slots = 20, healthchecks = hc_defaults, algorithm = "round-robin" },
[4] = { id = "d", ws_id = ws_id, name = "galileo", slots = 20, healthchecks = hc_defaults, algorithm = "round-robin" },
[5] = { id = "e", ws_id = ws_id, name = "upstream_e", slots = 10, healthchecks = hc_defaults, algorithm = "round-robin" },
[5] = { id = "e", ws_id = ws_id, name = "upstream_e", slots = 10, healthchecks = passive_hc, algorithm = "round-robin" },
[6] = { id = "f", ws_id = ws_id, name = "upstream_f", slots = 10, healthchecks = hc_defaults, algorithm = "round-robin" },
[7] = { id = "hc_" .. consistency, ws_id = ws_id, name = "upstream_hc_" .. consistency, slots = 10, healthchecks = passive_hc, algorithm = "round-robin" },
[8] = { id = "ph", ws_id = ws_id, name = "upstream_ph", slots = 10, healthchecks = passive_hc, algorithm = "round-robin" },
Expand Down
8 changes: 7 additions & 1 deletion spec/02-integration/04-admin_api/15-off_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -776,7 +776,13 @@ describe("Admin API #off", function()
upstreams:
- name: "foo"
targets:
- target: 10.20.30.40
- target: 10.20.30.40
healthchecks:
passive:
healthy:
successes: 1
unhealthy:
http_failures: 1
]]

local res = assert(client:send {
Expand Down
Loading

0 comments on commit 4ceda40

Please sign in to comment.