diff --git a/client/client.go b/client/client.go index d2c12b05dafe..6b66bd5a31b5 100644 --- a/client/client.go +++ b/client/client.go @@ -93,6 +93,11 @@ const ( allocSyncRetryIntv = 5 * time.Second ) +var ( + // grace period to allow for batch fingerprint processing + batchFirstFingerprintsProcessingGrace = batchFirstFingerprintsTimeout + 5*time.Second +) + // ClientStatsReporter exposes all the APIs related to resource usage of a Nomad // Client type ClientStatsReporter interface { @@ -419,6 +424,13 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic return nil, fmt.Errorf("failed to setup vault client: %v", err) } + // wait until drivers are healthy before restoring or registering with servers + select { + case <-c.Ready(): + case <-time.After(batchFirstFingerprintsProcessingGrace): + logger.Warn("batch fingerprint operation timed out; proceeding to register with fingerprinted plugins so far") + } + // Restore the state if err := c.restoreState(); err != nil { logger.Error("failed to restore state", "error", err) @@ -1456,13 +1468,7 @@ func (c *Client) watchNodeEvents() { // batchEvents stores events that have yet to be published var batchEvents []*structs.NodeEvent - // Create and drain the timer - timer := time.NewTimer(0) - timer.Stop() - select { - case <-timer.C: - default: - } + timer := stoppedTimer() defer timer.Stop() for { @@ -1918,7 +1924,8 @@ func (c *Client) updateNodeLocked() { // it will update the client node copy and re-register the node. func (c *Client) watchNodeUpdates() { var hasChanged bool - timer := time.NewTimer(c.retryIntv(nodeUpdateRetryIntv)) + + timer := stoppedTimer() defer timer.Stop() for { diff --git a/client/node_updater.go b/client/node_updater.go index 4367eeb4f7e1..702cfe8c2988 100644 --- a/client/node_updater.go +++ b/client/node_updater.go @@ -14,7 +14,7 @@ import ( var ( // batchFirstFingerprintsTimeout is the maximum amount of time to wait for // initial fingerprinting to complete before sending a batched Node update - batchFirstFingerprintsTimeout = 5 * time.Second + batchFirstFingerprintsTimeout = 50 * time.Second ) // batchFirstFingerprints waits for the first fingerprint response from all diff --git a/client/pluginmanager/drivermanager/manager.go b/client/pluginmanager/drivermanager/manager.go index 6592a2a80b0d..2c0474eea919 100644 --- a/client/pluginmanager/drivermanager/manager.go +++ b/client/pluginmanager/drivermanager/manager.go @@ -243,9 +243,18 @@ func (m *manager) waitForFirstFingerprint(ctx context.Context, cancel context.Ca } var mu sync.Mutex - var availDrivers []string + driversByStatus := map[drivers.HealthState][]string{} + var wg sync.WaitGroup + recordDriver := func(name string, lastHeath drivers.HealthState) { + mu.Lock() + defer mu.Unlock() + + updated := append(driversByStatus[lastHeath], name) + driversByStatus[lastHeath] = updated + } + // loop through instances and wait for each to finish initial fingerprint m.instancesMu.RLock() for n, i := range m.instances { @@ -253,16 +262,13 @@ func (m *manager) waitForFirstFingerprint(ctx context.Context, cancel context.Ca go func(name string, instance *instanceManager) { defer wg.Done() instance.WaitForFirstFingerprint(ctx) - if instance.getLastHealth() != drivers.HealthStateUndetected { - mu.Lock() - availDrivers = append(availDrivers, name) - mu.Unlock() - } + recordDriver(name, instance.getLastHealth()) }(n, i) } m.instancesMu.RUnlock() wg.Wait() - m.logger.Debug("detected drivers", "drivers", availDrivers) + + m.logger.Debug("detected drivers", "drivers", driversByStatus) } func (m *manager) loadReattachConfigs() error { diff --git a/client/util.go b/client/util.go index e3662a303eb1..af3bd75400b5 100644 --- a/client/util.go +++ b/client/util.go @@ -3,6 +3,7 @@ package client import ( "fmt" "math/rand" + "time" "github.com/hashicorp/nomad/nomad/structs" ) @@ -63,3 +64,13 @@ func shuffleStrings(list []string) { list[i], list[j] = list[j], list[i] } } + +// stoppedTimer returns a timer that's stopped and wouldn't fire until +// it's reset +func stoppedTimer() *time.Timer { + timer := time.NewTimer(0) + if !timer.Stop() { + <-timer.C + } + return timer +}