keyring: remove root key GC #15034

Merged · 1 commit · Oct 25, 2022
3 changes: 3 additions & 0 deletions .changelog/15034.txt
@@ -0,0 +1,3 @@
+ ```release-note:bug
+ keyring: Removed root key garbage collection to avoid orphaned workload identities
+ ```
99 changes: 10 additions & 89 deletions nomad/core_sched.go
@@ -60,7 +60,7 @@ func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
case structs.CoreJobGlobalTokenExpiredGC:
return c.expiredACLTokenGC(eval, true)
case structs.CoreJobRootKeyRotateOrGC:
- return c.rootKeyRotateOrGC(eval)
+ return c.rootKeyRotate(eval)
case structs.CoreJobVariablesRekey:
return c.variablesRekey(eval)
case structs.CoreJobForceGC:
@@ -96,9 +96,7 @@ func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
if err := c.expiredACLTokenGC(eval, true); err != nil {
return err
}
- if err := c.rootKeyGC(eval); err != nil {
- return err
- }
+
// Node GC must occur after the others to ensure the allocations are
// cleared.
return c.nodeGC(eval)
@@ -895,100 +893,23 @@ func (c *CoreScheduler) expiredACLTokenGC(eval *structs.Evaluation, global bool)
return c.srv.RPC(structs.ACLDeleteTokensRPCMethod, req, &structs.GenericResponse{})
}

- // rootKeyRotateOrGC is used to rotate or garbage collect root keys
- func (c *CoreScheduler) rootKeyRotateOrGC(eval *structs.Evaluation) error {
-
- // a rotation will be sent to the leader so our view of state
- // is no longer valid. we ack this core job and will pick up
- // the GC work on the next interval
- wasRotated, err := c.rootKeyRotation(eval)
- if err != nil {
- return err
- }
- if wasRotated {
- return nil
- }
- return c.rootKeyGC(eval)
- }
-
- func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error {
-
- // we can't GC any key older than the oldest live allocation
- // because it might have signed that allocation's workload
- // identity; this is conservative so that we don't have to iterate
- // over all the allocations and find out which keys signed their
- // identity, which will be expensive on large clusters
- allocOldThreshold, err := c.getOldestAllocationIndex()
- if err != nil {
- return err
- }
-
- oldThreshold := c.getThreshold(eval, "root key",
- "root_key_gc_threshold", c.srv.config.RootKeyGCThreshold)
-
- ws := memdb.NewWatchSet()
- iter, err := c.snap.RootKeyMetas(ws)
- if err != nil {
- return err
- }
-
- for {
- raw := iter.Next()
- if raw == nil {
- break
- }
- keyMeta := raw.(*structs.RootKeyMeta)
- if keyMeta.Active() {
- continue // never GC the active key
- }
- if keyMeta.CreateIndex > oldThreshold {
- continue // don't GC recent keys
- }
- if keyMeta.CreateIndex > allocOldThreshold {
- continue // don't GC keys possibly used to sign live allocations
- }
- varIter, err := c.snap.GetVariablesByKeyID(ws, keyMeta.KeyID)
- if err != nil {
- return err
- }
- if varIter.Next() != nil {
- continue // key is still in use
- }
-
- req := &structs.KeyringDeleteRootKeyRequest{
- KeyID: keyMeta.KeyID,
- WriteRequest: structs.WriteRequest{
- Region: c.srv.config.Region,
- AuthToken: eval.LeaderACL,
- },
- }
- if err := c.srv.RPC("Keyring.Delete",
- req, &structs.KeyringDeleteRootKeyResponse{}); err != nil {
- c.logger.Error("root key delete failed", "error", err)
- return err
- }
- }
-
- return nil
- }
-
- // rootKeyRotation checks if the active key is old enough that we need
- // to kick off a rotation. Returns true if the key was rotated.
- func (c *CoreScheduler) rootKeyRotation(eval *structs.Evaluation) (bool, error) {
+ // rootKeyRotate checks if the active key is old enough that we need
+ // to kick off a rotation.
+ func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation) error {

rotationThreshold := c.getThreshold(eval, "root key",
"root_key_rotation_threshold", c.srv.config.RootKeyRotationThreshold)

ws := memdb.NewWatchSet()
activeKey, err := c.snap.GetActiveRootKeyMeta(ws)
if err != nil {
- return false, err
+ return err
}
if activeKey == nil {
- return false, nil // no active key
+ return nil // no active key
}
if activeKey.CreateIndex >= rotationThreshold {
- return false, nil // key is too new
+ return nil // key is too new
}

req := &structs.KeyringRotateRootKeyRequest{
@@ -1000,10 +921,10 @@ func (c *CoreScheduler) rootKeyRotation(eval *structs.Evaluation) (bool, error)
if err := c.srv.RPC("Keyring.Rotate",
req, &structs.KeyringRotateRootKeyResponse{}); err != nil {
c.logger.Error("root key rotation failed", "error", err)
- return false, err
+ return err
}

- return true, nil
+ return nil
}

// variablesReKey is optionally run after rotating the active
74 changes: 17 additions & 57 deletions nomad/core_sched_test.go
@@ -2471,73 +2471,33 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) {
require.NotNil(t, key0, "expected keyring to be bootstapped")
require.NoError(t, err)

- // insert an "old" and inactive key
- key1 := structs.NewRootKeyMeta()
- key1.SetInactive()
- require.NoError(t, store.UpsertRootKeyMeta(500, key1, false))
-
- // insert an "old" and inactive key with a variable that's using it
- key2 := structs.NewRootKeyMeta()
- key2.SetInactive()
- require.NoError(t, store.UpsertRootKeyMeta(600, key2, false))
-
- variable := mock.VariableEncrypted()
- variable.KeyID = key2.KeyID
-
- setResp := store.VarSet(601, &structs.VarApplyStateRequest{
- Op: structs.VarOpSet,
- Var: variable,
- })
- require.NoError(t, setResp.Error)
-
- // insert an allocation
- alloc := mock.Alloc()
- alloc.ClientStatus = structs.AllocClientStatusRunning
- require.NoError(t, store.UpsertAllocs(
- structs.MsgTypeTestSetup, 700, []*structs.Allocation{alloc}))
-
- // insert an "old" key that's newer than oldest alloc
- key3 := structs.NewRootKeyMeta()
- key3.SetInactive()
- require.NoError(t, store.UpsertRootKeyMeta(750, key3, false))
-
- // insert a time table index before the last key
- tt := srv.fsm.TimeTable()
- tt.Witness(1000, time.Now().UTC().Add(-1*srv.config.RootKeyGCThreshold))
-
- // insert a "new" but inactive key
- key4 := structs.NewRootKeyMeta()
- key4.SetInactive()
- require.NoError(t, store.UpsertRootKeyMeta(1500, key4, false))
-
// run the core job
snap, err := store.Snapshot()
require.NoError(t, err)
core := NewCoreScheduler(srv, snap)
eval := srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, 2000)
c := core.(*CoreScheduler)
- require.NoError(t, c.rootKeyRotateOrGC(eval))
+ require.NoError(t, c.rootKeyRotate(eval))

- ws := memdb.NewWatchSet()
- key, err := store.RootKeyMetaByID(ws, key0.KeyID)
- require.NoError(t, err)
- require.NotNil(t, key, "active key should not have been GCd")
+ got, err := store.GetActiveRootKeyMeta(nil)
+ require.NotNil(t, got, "expected keyring to have an active key")
+ require.Equal(t, got.KeyID, key0.KeyID)

- key, err = store.RootKeyMetaByID(ws, key1.KeyID)
- require.NoError(t, err)
- require.Nil(t, key, "old key should have been GCd")
-
- key, err = store.RootKeyMetaByID(ws, key2.KeyID)
- require.NoError(t, err)
- require.NotNil(t, key, "old key should not have been GCd if still in use")
-
- key, err = store.RootKeyMetaByID(ws, key3.KeyID)
- require.NoError(t, err)
- require.NotNil(t, key, "old key newer than oldest alloc should not have been GCd")
+ // insert a time table index after the key
+ tt := srv.fsm.TimeTable()
+ tt.Witness(3000, time.Now().UTC().Add(-1*srv.config.RootKeyRotationThreshold))

- key, err = store.RootKeyMetaByID(ws, key4.KeyID)
+ // re-run the core job
+ snap, err = store.Snapshot()
require.NoError(t, err)
- require.NotNil(t, key, "new key should not have been GCd")
+ core = NewCoreScheduler(srv, snap)
+ eval = srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, 4000)
+ c = core.(*CoreScheduler)
+ require.NoError(t, c.rootKeyRotate(eval))
+
+ got, err = store.GetActiveRootKeyMeta(nil)
+ require.NotNil(t, got, "expected keyring to have an active key")
+ require.NotEqual(t, got.KeyID, key0.KeyID)
}

// TestCoreScheduler_VariablesRekey exercises variables rekeying
14 changes: 5 additions & 9 deletions website/content/docs/configuration/server.mdx
@@ -200,15 +200,11 @@ server {
rejoin the cluster.

- `root_key_gc_interval` `(string: "10m")` - Specifies the interval between
Review comment from the PR author on this line: Note: I'm intentionally leaving the name here because we'll want to use this interval for GC once we circle back to it, and I don't want to have to add a new interval for that too. Same applies to the name of the core job emitted by the leader.

- [encryption key][] metadata garbage collections.
-
- - `root_key_gc_threshold` `(string: "1h")` - Specifies the minimum time that an
- [encryption key][] must exist before it can be eligible for garbage
- collection.
+ checks to rotate the root [encryption key][].

- `root_key_rotation_threshold` `(string: "720h")` - Specifies the minimum time
that an [encryption key][] must exist before it is automatically rotated on
- the next garbage collection interval.
+ the next `root_key_gc_interval`.

- `server_join` <code>([server_join][server-join]: nil)</code> - Specifies
how the Nomad server will connect to other Nomad servers. The `retry_join`
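
For orientation only (not part of this diff), a minimal sketch of where these keyring settings sit in the server configuration, using the documented default values:

```hcl
server {
  enabled = true

  # Interval between the leader's periodic root keyring checks; after this
  # change the check only evaluates rotation, not garbage collection.
  root_key_gc_interval = "10m"

  # Minimum age the active root encryption key must reach before it is
  # rotated on the next check.
  root_key_rotation_threshold = "720h"
}
```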
@@ -392,7 +388,7 @@ regardless of cluster size.
The base formula for determining how often a Client must heartbeat is:

```
- <number of Clients> / <max_heartbeats_per_second> 
+ <number of Clients> / <max_heartbeats_per_second>
```
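
As a worked illustration (not part of this diff, and assuming the default `max_heartbeats_per_second` of `50.0`), a cluster of 5,000 Clients would get a base heartbeat TTL of:

```
5000 Clients / 50 heartbeats per second = 100 second base TTL
```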

Other factors modify this base TTL:
@@ -408,7 +404,7 @@ Other factors modify this base TTL:
to successfully heartbeat. This gives Clients time to discover a functioning
Server in case they were directly connected to a leader that crashed.

- For example, given the default values for heartbeat parameters, different sized 
+ For example, given the default values for heartbeat parameters, different sized
clusters will use the following TTLs for the heartbeats. Note that the `Server TTL`
simply adds the `heartbeat_grace` parameter to the TTL Clients are given.
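
Continuing that illustration (again, not part of this diff) with the default `heartbeat_grace` of `10s`, the 5,000-Client cluster above would get a Server TTL of roughly 100s + 10s = 110s.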

@@ -425,7 +421,7 @@ Regardless of size, all clients will have a Server TTL of
than the maximum Client TTL for your cluster size in order to prevent marking
live Clients as `down`.

- For clusters over 5000 Clients you should increase `failover_heartbeat_ttl` 
+ For clusters over 5000 Clients you should increase `failover_heartbeat_ttl`
using the following formula:

```