From 6efb949516f2be6f2ead928be050c22fbc8a9086 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 19 Apr 2019 09:00:24 -0400 Subject: [PATCH] client: wait for batched driver updates Here we retain the 0.8.7 behavior of waiting for driver fingerprints before registering a node, with some timeout. This is needed for system jobs, as system job scheduling for a node occurs at node registration, and the race might mean that a system job may not get placed on the node because of missing drivers. The timeout isn't strictly necessary, but we are raising it to roughly 1 minute (50 seconds, plus a 5-second grace period before registration), as that's closer to blocking indefinitely than 1 second. We need to keep the value high enough to capture as many drivers/devices as possible, but low enough that it doesn't risk blocking too long due to a misbehaving plugin. Fixes https://github.com/hashicorp/nomad/issues/5579 --- client/client.go | 12 ++++++++++++ client/node_updater.go | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/client/client.go b/client/client.go index f50e4a0a8997..6a9e55b3e719 100644 --- a/client/client.go +++ b/client/client.go @@ -93,6 +93,11 @@ const ( allocSyncRetryIntv = 5 * time.Second ) +var ( + // grace period to allow for batch fingerprint processing + batchFirstFingerprintsProcessingGrace = batchFirstFingerprintsTimeout + 5*time.Second +) + // ClientStatsReporter exposes all the APIs related to resource usage of a Nomad // Client type ClientStatsReporter interface { @@ -416,6 +421,13 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic return nil, fmt.Errorf("failed to setup vault client: %v", err) } + // wait until drivers are healthy before restoring or registering with servers + select { + case <-c.Ready(): + case <-time.After(batchFirstFingerprintsProcessingGrace): + logger.Warn("batched fingerprint, registering node with registered so far") + } + // Restore the state if err := c.restoreState(); err != nil { logger.Error("failed to restore state", "error", err) diff --git a/client/node_updater.go 
b/client/node_updater.go index 4367eeb4f7e1..702cfe8c2988 100644 --- a/client/node_updater.go +++ b/client/node_updater.go @@ -14,7 +14,7 @@ import ( var ( // batchFirstFingerprintsTimeout is the maximum amount of time to wait for // initial fingerprinting to complete before sending a batched Node update - batchFirstFingerprintsTimeout = 5 * time.Second + batchFirstFingerprintsTimeout = 50 * time.Second ) // batchFirstFingerprints waits for the first fingerprint response from all