Skip to content

Commit

Permalink
fingerprint: don't clear Consul/Vault attributes on failure
Browse files Browse the repository at this point in the history
Clients periodically fingerprint Vault and Consul to ensure the server has
updated attributes in the client's fingerprint. If the client can't reach
Vault/Consul, the fingerprinter clears the attributes and requires a node
update. Although this seems like correct behavior so that we can detect
intentional removal of Vault/Consul access, it has two serious failure modes:

(1) If a local Consul agent is restarted to pick up configuration changes and the
client happens to fingerprint at that moment, the client will update its
fingerprint and result in evaluations for all its jobs and all the system jobs
in the cluster.

(2) If a client loses Vault connectivity, the same thing happens. But the
consequences are much worse in the Vault case because Vault is not run as a
local agent, so Vault connectivity failures are highly correlated across the
entire cluster. A 15 second Vault outage will cause a new `node-update`
evalution for every system job on the cluster times the number of nodes, plus
one `node-update` evaluation for every non-system job on each node. On large
clusters of 1000s of nodes, we've seen this create a large backlog of evaluations.

This changeset updates the fingerprinting behavior to keep the last fingerprint
if Consul or Vault queries fail. This prevents a storm of evaluations at the
cost of requiring a client restart if Consul or Vault is intentionally removed
from the client.
  • Loading branch information
tgross committed Sep 23, 2022
1 parent 9c3ea13 commit 4b3db57
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 46 deletions.
3 changes: 3 additions & 0 deletions .changelog/14673.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
fingerprint: Consul and Vault attributes are no longer cleared on fingerprinting failure
```
10 changes: 0 additions & 10 deletions client/fingerprint/consul.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,6 @@ func (f *ConsulFingerprint) Periodic() (bool, time.Duration) {
return true, 15 * time.Second
}

// clearConsulAttributes removes consul attributes and links from the passed Node.
func (f *ConsulFingerprint) clearConsulAttributes(r *FingerprintResponse) {
for attr := range f.extractors {
r.RemoveAttribute(attr)
}
r.RemoveLink("consul")
}

func (f *ConsulFingerprint) initialize(req *FingerprintRequest) error {
// Only create the Consul client once to avoid creating many connections
if f.client == nil {
Expand Down Expand Up @@ -118,8 +110,6 @@ func (f *ConsulFingerprint) query(resp *FingerprintResponse) agentconsul.Self {
// If we can't hit this URL consul is probably not running on this machine.
info, err := f.client.Agent().Self()
if err != nil {
f.clearConsulAttributes(resp)

// indicate consul no longer available
if f.lastState == consulAvailable {
f.logger.Info("consul agent is unavailable")
Expand Down
34 changes: 6 additions & 28 deletions client/fingerprint/consul_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -411,20 +411,9 @@ func TestConsulFingerprint_Fingerprint_oss(t *testing.T) {

// execute second query with error
err2 := cf.Fingerprint(&FingerprintRequest{Config: cfg, Node: node}, &resp2)
require.NoError(t, err2) // does not return error
require.Equal(t, map[string]string{ // attributes set empty
"consul.datacenter": "",
"consul.revision": "",
"consul.segment": "",
"consul.server": "",
"consul.sku": "",
"consul.version": "",
"unique.consul.name": "",
"consul.connect": "",
"consul.grpc": "",
"consul.ft.namespaces": "",
}, resp2.Attributes)
require.True(t, resp.Detected) // never downgrade
require.NoError(t, err2) // does not return error
require.Nil(t, resp2.Attributes) // attributes unset so they don't change
require.True(t, resp.Detected) // never downgrade

// consul no longer available
require.Equal(t, consulUnavailable, cf.lastState)
Expand Down Expand Up @@ -501,20 +490,9 @@ func TestConsulFingerprint_Fingerprint_ent(t *testing.T) {

// execute second query with error
err2 := cf.Fingerprint(&FingerprintRequest{Config: cfg, Node: node}, &resp2)
require.NoError(t, err2) // does not return error
require.Equal(t, map[string]string{ // attributes set empty
"consul.datacenter": "",
"consul.revision": "",
"consul.segment": "",
"consul.server": "",
"consul.sku": "",
"consul.version": "",
"consul.ft.namespaces": "",
"consul.connect": "",
"consul.grpc": "",
"unique.consul.name": "",
}, resp2.Attributes)
require.True(t, resp.Detected) // never downgrade
require.NoError(t, err2) // does not return error
require.Nil(t, resp2.Attributes) // attributes unset so they don't change
require.True(t, resp.Detected) // never downgrade

// consul no longer available
require.Equal(t, consulUnavailable, cf.lastState)
Expand Down
9 changes: 1 addition & 8 deletions client/fingerprint/vault.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ func (f *VaultFingerprint) Fingerprint(req *FingerprintRequest, resp *Fingerprin
// Connect to vault and parse its information
status, err := f.client.Sys().SealStatus()
if err != nil {
f.clearVaultAttributes(resp)

// Print a message indicating that Vault is not available anymore
if f.lastState == vaultAvailable {
f.logger.Info("Vault is unavailable")
Expand Down Expand Up @@ -80,10 +80,3 @@ func (f *VaultFingerprint) Fingerprint(req *FingerprintRequest, resp *Fingerprin
func (f *VaultFingerprint) Periodic() (bool, time.Duration) {
return true, 15 * time.Second
}

func (f *VaultFingerprint) clearVaultAttributes(r *FingerprintResponse) {
r.RemoveAttribute("vault.accessible")
r.RemoveAttribute("vault.version")
r.RemoveAttribute("vault.cluster_id")
r.RemoveAttribute("vault.cluster_name")
}
17 changes: 17 additions & 0 deletions client/fingerprint/vault_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,21 @@ func TestVaultFingerprint(t *testing.T) {
assertNodeAttributeContains(t, response.Attributes, "vault.version")
assertNodeAttributeContains(t, response.Attributes, "vault.cluster_id")
assertNodeAttributeContains(t, response.Attributes, "vault.cluster_name")

tv.Stop()

err = fp.Fingerprint(request, &response)
if err != nil {
t.Fatalf("Failed to fingerprint: %s", err)
}

if !response.Detected {
t.Fatalf("should still show as detected")
}

assertNodeAttributeContains(t, response.Attributes, "vault.accessible")
assertNodeAttributeContains(t, response.Attributes, "vault.version")
assertNodeAttributeContains(t, response.Attributes, "vault.cluster_id")
assertNodeAttributeContains(t, response.Attributes, "vault.cluster_name")

}
4 changes: 4 additions & 0 deletions testutil/vault.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package testutil

import (
"errors"
"fmt"
"math/rand"
"os"
Expand Down Expand Up @@ -215,6 +216,9 @@ func (tv *TestVault) Stop() {
}

if err := tv.cmd.Process.Kill(); err != nil {
if errors.Is(err, os.ErrProcessDone) {
return
}
tv.t.Errorf("err: %s", err)
}
if tv.waitCh != nil {
Expand Down

0 comments on commit 4b3db57

Please sign in to comment.