From 994f0209a17aa614dea7cb3d9695d20b3ed6bb4f Mon Sep 17 00:00:00 2001 From: Shuyang Wu Date: Wed, 30 Jun 2021 09:16:03 -0400 Subject: [PATCH] feat: enable etcd health-check (#4191) --- apisix/cli/ngx_tpl.lua | 1 + apisix/core/config_etcd.lua | 46 ++++++++++++- conf/config-default.yaml | 1 + rockspec/apisix-master-0.rockspec | 2 +- t/APISIX.pm | 1 + t/cli/docker-compose-etcd-cluster.yaml | 69 +++++++++++++++++++ t/cli/test_etcd_healthcheck.sh | 92 ++++++++++++++++++++++++++ t/core/config_etcd.t | 16 ++--- 8 files changed, 217 insertions(+), 11 deletions(-) create mode 100644 t/cli/docker-compose-etcd-cluster.yaml create mode 100755 t/cli/test_etcd_healthcheck.sh diff --git a/apisix/cli/ngx_tpl.lua b/apisix/cli/ngx_tpl.lua index 3e44b53d7bd7..ebe0da1e04a7 100644 --- a/apisix/cli/ngx_tpl.lua +++ b/apisix/cli/ngx_tpl.lua @@ -155,6 +155,7 @@ http { lua_shared_dict plugin-limit-count-redis-cluster-slot-lock 1m; lua_shared_dict tracing_buffer 10m; # plugin: skywalking lua_shared_dict plugin-api-breaker 10m; + lua_shared_dict etcd_cluster_health_check 10m; # etcd health check # for openid-connect and authz-keycloak plugin lua_shared_dict discovery 1m; # cache for discovery metadata documents diff --git a/apisix/core/config_etcd.lua b/apisix/core/config_etcd.lua index a9888a3f4bc3..38a32e0f78dc 100644 --- a/apisix/core/config_etcd.lua +++ b/apisix/core/config_etcd.lua @@ -38,12 +38,15 @@ local tostring = tostring local tonumber = tonumber local xpcall = xpcall local debug = debug +local string = string local error = error local rand = math.random local constants = require("apisix.constants") +local health_check = require("resty.etcd.health_check") local is_http = ngx.config.subsystem == "http" +local err_etcd_unhealthy_all = "has no healthy etcd endpoint available" local created_obj = {} local loaded_configuration = {} @@ -146,7 +149,11 @@ local function waitdir(etcd_cli, key, modified_index, timeout) end if type(res.result) ~= "table" then - return nil, "failed to wait 
etcd dir" + err = "failed to wait etcd dir" + if res.error and res.error.message then + err = err .. ": " .. res.error.message + end + return nil, err end return etcd_apisix.watch_format(res) end @@ -529,6 +536,18 @@ local function _automatic_fetch(premature, self) return end + if not health_check.conf then + local _, err = health_check.init({ + shm_name = "etcd_cluster_health_check", + fail_timeout = self.health_check_timeout, + max_fails = 3, + retry = true, + }) + if err then + log.warn("fail to create health_check: " .. err) + end + end + local i = 0 while not exiting() and self.running and i <= 32 do i = i + 1 @@ -545,7 +564,25 @@ local function _automatic_fetch(premature, self) local ok, err = sync_data(self) if err then - if err ~= "timeout" and err ~= "Key not found" + if string.find(err, err_etcd_unhealthy_all) then + local reconnected = false + while err and not reconnected and i <= 32 do + local backoff_duration, backoff_factor, backoff_step = 1, 2, 6 + for _ = 1, backoff_step do + i = i + 1 + ngx_sleep(backoff_duration) + _, err = sync_data(self) + if not err or not string.find(err, err_etcd_unhealthy_all) then + log.warn("reconnected to etcd") + reconnected = true + break + end + backoff_duration = backoff_duration * backoff_factor + log.error("no healthy etcd endpoint available, next retry after " + .. backoff_duration .. 
"s") + end + end + elseif err ~= "timeout" and err ~= "Key not found" and self.last_err ~= err then log.error("failed to fetch data from etcd: ", err, ", ", tostring(self)) @@ -594,6 +631,10 @@ function _M.new(key, opts) if not resync_delay or resync_delay < 0 then resync_delay = 5 end + local health_check_timeout = etcd_conf.health_check_timeout + if not health_check_timeout or health_check_timeout < 0 then + health_check_timeout = 10 + end local automatic = opts and opts.automatic local item_schema = opts and opts.item_schema @@ -618,6 +659,7 @@ function _M.new(key, opts) last_err = nil, last_err_time = nil, resync_delay = resync_delay, + health_check_timeout = health_check_timeout, timeout = timeout, single_item = single_item, filter = filter_fun, diff --git a/conf/config-default.yaml b/conf/config-default.yaml index 3065646d4202..eedf77febdfa 100644 --- a/conf/config-default.yaml +++ b/conf/config-default.yaml @@ -209,6 +209,7 @@ etcd: prefix: "/apisix" # apisix configurations prefix timeout: 30 # 30 seconds #resync_delay: 5 # when sync failed and a rest is needed, resync after the configured seconds plus 50% random jitter + #health_check_timeout: 10 # etcd retry the unhealthy nodes after the configured seconds #user: root # root username for etcd #password: 5tHkHhYkjr6cQY # root password for etcd tls: diff --git a/rockspec/apisix-master-0.rockspec b/rockspec/apisix-master-0.rockspec index f91ba686cb38..f9d73da0224d 100644 --- a/rockspec/apisix-master-0.rockspec +++ b/rockspec/apisix-master-0.rockspec @@ -34,7 +34,7 @@ dependencies = { "lua-resty-ctxdump = 0.1-0", "lua-resty-dns-client = 5.2.0", "lua-resty-template = 2.0", - "lua-resty-etcd = 1.5.0", + "lua-resty-etcd = 1.5.3", "lua-resty-balancer = 0.02rc5", "lua-resty-ngxvar = 0.5.2", "lua-resty-jit-uuid = 0.0.7", diff --git a/t/APISIX.pm b/t/APISIX.pm index c09f6cf78c69..e0c97cd9433d 100644 --- a/t/APISIX.pm +++ b/t/APISIX.pm @@ -430,6 +430,7 @@ _EOC_ lua_shared_dict discovery 1m; # plugin authz-keycloak 
lua_shared_dict plugin-api-breaker 10m; lua_capture_error_log 1m; # plugin error-log-logger + lua_shared_dict etcd_cluster_health_check 10m; # etcd health check proxy_ssl_name \$upstream_host; proxy_ssl_server_name on; diff --git a/t/cli/docker-compose-etcd-cluster.yaml b/t/cli/docker-compose-etcd-cluster.yaml new file mode 100644 index 000000000000..a2fcef74ae28 --- /dev/null +++ b/t/cli/docker-compose-etcd-cluster.yaml @@ -0,0 +1,69 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +version: "3.7" + +services: + etcd0: + image: "gcr.io/etcd-development/etcd:v3.4.15" + container_name: etcd0 + ports: + - "23800:2380" + - "23790:2379" + environment: + - ALLOW_NONE_AUTHENTICATION=yes + - ETCD_NAME=etcd0 + - ETCD_LISTEN_PEER_URLS=http://0.0.0.0:2380 + - ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379 + - ETCD_ADVERTISE_CLIENT_URLS=http://127.0.0.1:23790 + - ETCD_INITIAL_ADVERTISE_PEER_URLS=http://etcd0:2380 + - ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster + - ETCD_INITIAL_CLUSTER=etcd0=http://etcd0:2380,etcd1=http://etcd1:2380,etcd2=http://etcd2:2380 + - ETCD_INITIAL_CLUSTER_STATE=new + + etcd1: + image: "gcr.io/etcd-development/etcd:v3.4.15" + container_name: etcd1 + ports: + - "23801:2380" + - "23791:2379" + environment: + - ALLOW_NONE_AUTHENTICATION=yes + - ETCD_NAME=etcd1 + - ETCD_LISTEN_PEER_URLS=http://0.0.0.0:2380 + - ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379 + - ETCD_ADVERTISE_CLIENT_URLS=http://127.0.0.1:23791 + - ETCD_INITIAL_ADVERTISE_PEER_URLS=http://etcd1:2380 + - ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster + - ETCD_INITIAL_CLUSTER=etcd0=http://etcd0:2380,etcd1=http://etcd1:2380,etcd2=http://etcd2:2380 + - ETCD_INITIAL_CLUSTER_STATE=new + + etcd2: + image: "gcr.io/etcd-development/etcd:v3.4.15" + container_name: etcd2 + ports: + - "23802:2380" + - "23792:2379" + environment: + - ALLOW_NONE_AUTHENTICATION=yes + - ETCD_NAME=etcd2 + - ETCD_LISTEN_PEER_URLS=http://0.0.0.0:2380 + - ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379 + - ETCD_ADVERTISE_CLIENT_URLS=http://127.0.0.1:23792 + - ETCD_INITIAL_ADVERTISE_PEER_URLS=http://etcd2:2380 + - ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster + - ETCD_INITIAL_CLUSTER=etcd0=http://etcd0:2380,etcd1=http://etcd1:2380,etcd2=http://etcd2:2380 + - ETCD_INITIAL_CLUSTER_STATE=new diff --git a/t/cli/test_etcd_healthcheck.sh b/t/cli/test_etcd_healthcheck.sh new file mode 100755 index 000000000000..62498f17f5d8 --- /dev/null +++ b/t/cli/test_etcd_healthcheck.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash + +# +# Licensed to the 
Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +. ./t/cli/common.sh + +# create a 3-node etcd cluster in docker +ETCD_NAME_0=etcd0 +ETCD_NAME_1=etcd1 +ETCD_NAME_2=etcd2 +HEALTH_CHECK_RETRY_TIMEOUT=10 + +echo ' +etcd: + host: + - "http://127.0.0.1:23790" + - "http://127.0.0.1:23791" + - "http://127.0.0.1:23792" + health_check_timeout: '"$HEALTH_CHECK_RETRY_TIMEOUT"' +' > conf/config.yaml + +docker-compose -f ./t/cli/docker-compose-etcd-cluster.yaml up -d + +# Check that apisix is not affected when one etcd node is disconnected +make init && make run + +docker stop ${ETCD_NAME_0} +code=$(curl -o /dev/null -s -w %{http_code} http://127.0.0.1:9080/apisix/admin/routes -H 'X-API-KEY: edd1c9f034335f136f87ad84b625c8f1') +if [ ! $code -eq 200 ]; then + echo "failed: apisix got effect when one etcd node out of a cluster disconnected" + exit 1 +fi +docker start ${ETCD_NAME_0} + +docker stop ${ETCD_NAME_1} +code=$(curl -o /dev/null -s -w %{http_code} http://127.0.0.1:9080/apisix/admin/routes -H 'X-API-KEY: edd1c9f034335f136f87ad84b625c8f1') +if [ ! 
$code -eq 200 ]; then + echo "failed: apisix got effect when one etcd node out of a cluster disconnected" + exit 1 +fi +docker start ${ETCD_NAME_1} + +make stop + +echo "passed: apisix not got effected when one etcd node disconnected" + +# Check that when all etcd nodes are disconnected, apisix retries with backoff and recovers once they are reachable again +make init && make run + +docker stop ${ETCD_NAME_0} && docker stop ${ETCD_NAME_1} && docker stop ${ETCD_NAME_2} + +sleep_till=$(( $(date +%s) + $HEALTH_CHECK_RETRY_TIMEOUT )) # epoch deadline; plain arithmetic, no unset $DATE or GNU-only `date -d` + +code=$(curl -o /dev/null -s -w %{http_code} http://127.0.0.1:9080/apisix/admin/routes -H 'X-API-KEY: edd1c9f034335f136f87ad84b625c8f1') +if [ $code -eq 200 ]; then + echo "failed: apisix not got effect when all etcd nodes disconnected" + exit 1 +fi + +docker start ${ETCD_NAME_0} && docker start ${ETCD_NAME_1} && docker start ${ETCD_NAME_2} + +# sleep until the etcd health check retries the unhealthy nodes +current_time=$(date +%s) +sleep_seconds=$(( $sleep_till - $current_time )) +if [ "$sleep_seconds" -gt 0 ]; then + sleep $sleep_seconds +fi + +code=$(curl -o /dev/null -s -w %{http_code} http://127.0.0.1:9080/apisix/admin/routes -H 'X-API-KEY: edd1c9f034335f136f87ad84b625c8f1') +if [ ! 
$code -eq 200 ]; then + echo "failed: apisix could not recover when etcd node recover" + exit 1 +fi + +make stop + +echo "passed: when all etcd nodes disconnected, apisix trying to reconnect with backoff, and could successfully recover when reconnected" diff --git a/t/core/config_etcd.t b/t/core/config_etcd.t index fbffc9f3d785..edab98fc5df7 100644 --- a/t/core/config_etcd.t +++ b/t/core/config_etcd.t @@ -44,9 +44,9 @@ etcd: --- request GET /t --- grep_error_log eval -qr{failed to fetch data from etcd: connection refused, etcd key: .*routes} +qr{connection refused} --- grep_error_log_out eval -qr/(failed to fetch data from etcd: connection refused, etcd key: .*routes\n){1,}/ +qr/(connection refused){1,}/ @@ -68,9 +68,9 @@ etcd: --- request GET /t --- grep_error_log chop -failed to fetch data from etcd: handshake failed +handshake failed --- grep_error_log_out eval -qr/(failed to fetch data from etcd: handshake failed){1,}/ +qr/(handshake failed){1,}/ @@ -92,9 +92,9 @@ etcd: --- request GET /t --- grep_error_log chop -failed to fetch data from etcd: closed +closed --- grep_error_log_out eval -qr/(failed to fetch data from etcd: closed){1,}/ +qr/(closed){1,}/ @@ -116,9 +116,9 @@ etcd: --- request GET /t --- grep_error_log chop -failed to fetch data from etcd: 18: self signed certificate +18: self signed certificate --- grep_error_log_out eval -qr/(failed to fetch data from etcd: 18: self signed certificate){1,}/ +qr/(18: self signed certificate){1,}/