From 2e1978eb1fa274a24432f4b07076fe58789c83fc Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Sat, 29 Jun 2019 04:17:35 -0500 Subject: [PATCH] client: defensive against getting stale alloc updates When fetching node alloc assignments, be defensive against a stale read before killing local nodes allocs. The bug is when both client and servers are restarting and the client requests the node allocation for the node, it might get stale data as server hasn't finished applying all the restored raft transaction to store. Consequently, client would kill and destroy the alloc locally, just to fetch it again moments later when server store is up to date. The bug can be reproduced quite reliably with single node setup (configured with persistence). I suspect it's too edge-casey to occur in production cluster with multiple servers, but we may need to examine leader failover scenarios more closely. In this commit, we only remove and destroy allocs if the removal index is more recent than the alloc index. This seems like a cheap resiliency fix we already use for detecting alloc updates. A more proper fix would be to ensure that a nomad server only serves RPC calls when state store is fully restored or up to date in leadership transition cases. --- client/client.go | 4 ++++ client/util.go | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/client/client.go b/client/client.go index f16a29562abb..f99c2f330ac2 100644 --- a/client/client.go +++ b/client/client.go @@ -1767,6 +1767,9 @@ func (c *Client) allocSync() { // allocUpdates holds the results of receiving updated allocations from the // servers. type allocUpdates struct { + // index is index of server store snapshot used for fetching alloc status + index uint64 + // pulled is the set of allocations that were downloaded from the servers. pulled map[string]*structs.Allocation @@ -1944,6 +1947,7 @@ OUTER: filtered: filtered, pulled: pulledAllocs, migrateTokens: resp.MigrateTokens, + index: resp.Index, } select { diff --git a/client/util.go b/client/util.go index af3bd75400b5..b8690f647c63 100644 --- a/client/util.go +++ b/client/util.go @@ -33,7 +33,7 @@ func diffAllocs(existing map[string]uint64, allocs *allocUpdates) *diffResult { _, filtered := allocs.filtered[existID] // If not updated or filtered, removed - if !pulled && !filtered { + if !pulled && !filtered && allocs.index > existIndex { result.removed = append(result.removed, existID) continue }