Skip to content

Commit

Permalink
fingerprint: lengthen Vault check after seen (#14693)
Browse files Browse the repository at this point in the history
Extension of #14673

Once Vault is initially fingerprinted, extend the period since changes
should be infrequent and the fingerprint is relatively expensive since
it is contacting a central Vault server.

Also move the period timer reset *after* the fingerprint. This is
similar to #9435 where the idea is to ensure the retry period starts
*after* the operation is attempted. 15s will be the *minimum* time
between fingerprints now instead of the *maximum* time between
fingerprints.

In the case of Vault fingerprinting, the original behavior might cause
the following:

1. Timer is reset to 15s
2. Fingerprint takes 16s
3. Timer has already elapsed so we immediately Fingerprint again

Even if fingerprinting Vault only takes a few seconds, that may very
well be due to excessive load and backing off our fingerprints is
desirable. The new bevahior ensures we always wait at least 15s between
fingerprint attempts and should allow some natural jittering based on
server load and network latency.
  • Loading branch information
schmichael committed Sep 26, 2022
1 parent 5457239 commit d677b48
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 7 deletions.
7 changes: 7 additions & 0 deletions client/fingerprint/vault.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"time"

log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/helper"
vapi "github.com/hashicorp/vault/api"
)

Expand Down Expand Up @@ -78,5 +79,11 @@ func (f *VaultFingerprint) Fingerprint(req *FingerprintRequest, resp *Fingerprin
}

func (f *VaultFingerprint) Periodic() (bool, time.Duration) {
if f.lastState == vaultAvailable {
// Fingerprint infrequently once Vault is initially discovered with wide
// jitter to avoid thundering herds of fingerprints against central Vault
// servers.
return true, (30 * time.Second) + helper.RandomStagger(90*time.Second)
}
return true, 15 * time.Second
}
27 changes: 27 additions & 0 deletions client/fingerprint/vault_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package fingerprint

import (
"testing"
"time"

"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/client/config"
Expand All @@ -21,6 +22,14 @@ func TestVaultFingerprint(t *testing.T) {
Attributes: make(map[string]string),
}

p, period := fp.Periodic()
if !p {
t.Fatalf("expected fingerprint to be periodic")
}
if period != (15 * time.Second) {
t.Fatalf("expected period to be 15s but found: %s", period)
}

conf := config.DefaultConfig()
conf.VaultConfig = tv.Config

Expand All @@ -40,6 +49,16 @@ func TestVaultFingerprint(t *testing.T) {
assertNodeAttributeContains(t, response.Attributes, "vault.cluster_id")
assertNodeAttributeContains(t, response.Attributes, "vault.cluster_name")

// Period should be longer after initial discovery
p, period = fp.Periodic()
if !p {
t.Fatalf("expected fingerprint to be periodic")
}
if period < (30*time.Second) || period > (2*time.Minute) {
t.Fatalf("expected period to be between 30s and 2m but found: %s", period)
}

// Stop Vault to simulate it being unavailable
tv.Stop()

err = fp.Fingerprint(request, &response)
Expand All @@ -56,4 +75,12 @@ func TestVaultFingerprint(t *testing.T) {
assertNodeAttributeContains(t, response.Attributes, "vault.cluster_id")
assertNodeAttributeContains(t, response.Attributes, "vault.cluster_name")

// Period should be original once trying to discover Vault is available again
p, period = fp.Periodic()
if !p {
t.Fatalf("expected fingerprint to be periodic")
}
if period != (15 * time.Second) {
t.Fatalf("expected period to be 15s but found: %s", period)
}
}
13 changes: 7 additions & 6 deletions client/fingerprint_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,9 @@ func (fm *FingerprintManager) setupFingerprinters(fingerprints []string) error {
appliedFingerprints = append(appliedFingerprints, name)
}

p, period := f.Periodic()
p, _ := f.Periodic()
if p {
go fm.runFingerprint(f, period, name)
go fm.runFingerprint(f, name)
}

if rfp, ok := f.(fingerprint.ReloadableFingerprint); ok {
Expand All @@ -157,23 +157,24 @@ func (fm *FingerprintManager) setupFingerprinters(fingerprints []string) error {
}

// runFingerprint runs each fingerprinter individually on an ongoing basis
func (fm *FingerprintManager) runFingerprint(f fingerprint.Fingerprint, period time.Duration, name string) {
fm.logger.Debug("fingerprinting periodically", "fingerprinter", name, "period", period)
func (fm *FingerprintManager) runFingerprint(f fingerprint.Fingerprint, name string) {
_, period := f.Periodic()
fm.logger.Debug("fingerprinting periodically", "fingerprinter", name, "initial_period", period)

timer := time.NewTimer(period)
defer timer.Stop()

for {
select {
case <-timer.C:
timer.Reset(period)

_, err := fm.fingerprint(name, f)
if err != nil {
fm.logger.Debug("error periodic fingerprinting", "error", err, "fingerprinter", name)
continue
}

_, period = f.Periodic()
timer.Reset(period)
case <-fm.shutdownCh:
return
}
Expand Down
2 changes: 1 addition & 1 deletion website/content/docs/upgrade/upgrade-specific.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ the `-json` flag is provided.

Nomad clients no longer have their Consul and Vault fingerprints cleared when
connectivity is lost with Consul and Vault. To intentionally remove Consul and
Vault from a client node, you will need to restart the client.
Vault from a client node, you will need to restart the Nomad client agent.

## Nomad 1.3.3

Expand Down

0 comments on commit d677b48

Please sign in to comment.