From c03cd0e8ca7a8e4632178067dfb1a2a258ea9b85 Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Thu, 3 Nov 2022 18:51:37 -0400 Subject: [PATCH 1/4] client: retry RPC call when no server is available When a Nomad service starts it tries to establish a connection with servers, but it also runs alloc runners to manage whatever allocations it needs to run. The alloc runner will invoke several hooks to perform actions, with some of them requiring access to the Nomad servers, such as Native Service Discovery Registration. If the alloc runner starts before a connection is established the alloc runner will fail, causing the allocation to be shutdown. This is particularly problematic for disconnected allocations that are reconnecting, as they may fail as soon as the client reconnects. This commit changes the RPC request logic to retry it, using the existing retry mechanism, if there are no servers available. --- client/rpc.go | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/client/rpc.go b/client/rpc.go index 9bc439ad03de..836f7366a88d 100644 --- a/client/rpc.go +++ b/client/rpc.go @@ -70,34 +70,39 @@ func (c *Client) RPC(method string, args interface{}, reply interface{}) error { } TRY: + var rpcErr error + server := c.servers.FindServer() if server == nil { - return noServersErr + rpcErr = noServersErr } - // Make the request. - rpcErr := c.connPool.RPC(c.Region(), server.Addr, method, args, reply) + if server != nil { + // Make the request. + rpcErr = c.connPool.RPC(c.Region(), server.Addr, method, args, reply) - if rpcErr == nil { - c.fireRpcRetryWatcher() - return nil - } + if rpcErr == nil { + c.fireRpcRetryWatcher() + return nil + } - // If shutting down, exit without logging the error - select { - case <-c.shutdownCh: - return nil - default: - } + // If shutting down, exit without logging the error + select { + case <-c.shutdownCh: + return nil + default: + } - // Move off to another server, and see if we can retry. - c.rpcLogger.Error("error performing RPC to server", "error", rpcErr, "rpc", method, "server", server.Addr) - c.servers.NotifyFailedServer(server) + // Move off to another server, and see if we can retry. + c.rpcLogger.Error("error performing RPC to server", "error", rpcErr, "rpc", method, "server", server.Addr) + c.servers.NotifyFailedServer(server) - if !canRetry(args, rpcErr) { - c.rpcLogger.Error("error performing RPC to server which is not safe to automatically retry", "error", rpcErr, "rpc", method, "server", server.Addr) - return rpcErr + if !canRetry(args, rpcErr) { + c.rpcLogger.Error("error performing RPC to server which is not safe to automatically retry", "error", rpcErr, "rpc", method, "server", server.Addr) + return rpcErr + } } + if time.Now().After(deadline) { // Blocking queries are tricky. jitters and rpcholdtimes in multiple places can result in our server call taking longer than we wanted it to. For example: // a block time of 5s may easily turn into the server blocking for 10s since it applies its own RPCHoldTime. If the server dies at t=7s we still want to retry From 7f2ae1d3c4138ba43239693cf95a54a61bd98d5f Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Thu, 3 Nov 2022 19:14:18 -0400 Subject: [PATCH 2/4] changelog: add entry for #15140 --- .changelog/15140.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .changelog/15140.txt diff --git a/.changelog/15140.txt b/.changelog/15140.txt new file mode 100644 index 000000000000..d6ffc32110d1 --- /dev/null +++ b/.changelog/15140.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: prevent allocations from failing on client reconnect by retrying RPC requests when no servers are available yet +``` From 5abaf00ee32a867e6cba4b2a2b5bce3614c8f82a Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Fri, 4 Nov 2022 12:39:45 -0400 Subject: [PATCH 3/4] else --- client/rpc.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/client/rpc.go b/client/rpc.go index 836f7366a88d..7d63bdcd71e1 100644 --- a/client/rpc.go +++ b/client/rpc.go @@ -75,9 +75,7 @@ TRY: server := c.servers.FindServer() if server == nil { rpcErr = noServersErr - } - - if server != nil { + } else { // Make the request. rpcErr = c.connPool.RPC(c.Region(), server.Addr, method, args, reply) From 58b0f34cd2deadee0e1121dec56e6ded92be9c2b Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Fri, 4 Nov 2022 13:12:45 -0400 Subject: [PATCH 4/4] client: prevent nil access to server --- client/rpc.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/rpc.go b/client/rpc.go index 7d63bdcd71e1..54e2edec5691 100644 --- a/client/rpc.go +++ b/client/rpc.go @@ -109,7 +109,7 @@ TRY: info.SetTimeToBlock(0) return c.RPC(method, args, reply) } - c.rpcLogger.Error("error performing RPC to server, deadline exceeded, cannot retry", "error", rpcErr, "rpc", method, "server", server.Addr) + c.rpcLogger.Error("error performing RPC to server, deadline exceeded, cannot retry", "error", rpcErr, "rpc", method) return rpcErr }