From 4510f6d7c1f9537ce25f0d15c4b00719e71b8296 Mon Sep 17 00:00:00 2001 From: Charlie Voiselle <464492+angrycub@users.noreply.github.com> Date: Thu, 25 May 2023 16:04:54 -0400 Subject: [PATCH] [core] nil check and error handling for client status in heartbeat responses (#17316) Add a nil check to constructNodeServerInfoResponse to manage an apparent race between deregister and client heartbeats. Fixes #17310 --- .changelog/17316.txt | 3 +++ client/client.go | 3 ++- nomad/node_endpoint.go | 22 ++++++++++++++++++---- nomad/node_endpoint_test.go | 20 ++++++++++++++++++++ 4 files changed, 43 insertions(+), 5 deletions(-) create mode 100644 .changelog/17316.txt diff --git a/.changelog/17316.txt b/.changelog/17316.txt new file mode 100644 index 000000000000..f0f16afcaa98 --- /dev/null +++ b/.changelog/17316.txt @@ -0,0 +1,3 @@ +```release-note:bug +core: Fix panic around client deregistration and pending heartbeats +``` diff --git a/client/client.go b/client/client.go index eb8fe440b779..daf7512a675b 100644 --- a/client/client.go +++ b/client/client.go @@ -2024,7 +2024,8 @@ func (c *Client) updateNodeStatus() error { } // Check heartbeat response for information about the server-side scheduling - // state of this node + // state of this node. If there are errors on the server side, this will come + // back as an empty string. c.UpdateConfig(func(c *config.Config) { if resp.SchedulingEligibility != "" { c.Node.SchedulingEligibility = resp.SchedulingEligibility diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 37673bae4c2b..e2ce4156fbed 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -286,7 +286,7 @@ func equalDevices(n1, n2 *structs.Node) bool { return reflect.DeepEqual(n1.NodeResources.Devices, n2.NodeResources.Devices) } -// updateNodeUpdateResponse assumes the n.srv.peerLock is held for reading. +// constructNodeServerInfoResponse assumes the n.srv.peerLock is held for reading. func (n *Node) constructNodeServerInfoResponse(nodeID string, snap *state.StateSnapshot, reply *structs.NodeUpdateResponse) error { reply.LeaderRPCAddr = string(n.srv.raft.Leader()) @@ -300,16 +300,30 @@ func (n *Node) constructNodeServerInfoResponse(nodeID string, snap *state.StateS }) } + ws := memdb.NewWatchSet() + // Add ClientStatus information to heartbeat response. - node, _ := snap.NodeByID(nil, nodeID) - reply.SchedulingEligibility = node.SchedulingEligibility + if node, err := snap.NodeByID(ws, nodeID); err == nil && node != nil { + reply.SchedulingEligibility = node.SchedulingEligibility + } else if node == nil { + + // If the node is not found, leave reply.SchedulingEligibility as + // the empty string. The response handler in the client treats this + // as a no-op. As there is no call to action for an operator, log it + // at debug level. + n.logger.Debug("constructNodeServerInfoResponse: node not found", + "node_id", nodeID) + } else { + + // This case is likely only reached via a code error in state store + return err + } // TODO(sean@): Use an indexed node count instead // // Snapshot is used only to iterate over all nodes to create a node // count to send back to Nomad Clients in their heartbeat so Clients // can estimate the size of the cluster. - ws := memdb.NewWatchSet() iter, err := snap.Nodes(ws) if err == nil { for { diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 68f4ea5e070f..8dd1388ef9aa 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -4434,3 +4434,23 @@ func TestNode_List_PaginationFiltering(t *testing.T) { }) } } + +func TestNode_constructNodeServerInfoResponse_MissingNode(t *testing.T) { + ci.Parallel(t) + + s, cleanup := TestServer(t, nil) + defer cleanup() + testutil.WaitForLeader(t, s.RPC) + + // Create a node that isn't a member of the state to force a not found + node := mock.Node() + var reply structs.NodeUpdateResponse + + nE := NewNodeEndpoint(s, nil) + snap, err := s.State().Snapshot() + must.NoError(t, err) + + // call constructNodeServerInfoResponse. Before GH #17316 this would panic + require.NoError(t, nE.constructNodeServerInfoResponse(node.ID, snap, &reply)) + must.NotNil(t, &reply) +}