keyring: safely handle missing keys and restore GC (#15092)
When replication of a single key fails, the replication loop breaks early and
therefore keys that fall later in the sorting order will never get
replicated. This is particularly a problem for clusters impacted by the bug that
caused #14981 and that were later upgraded; the keys that were never replicated
can now never be replicated, and so we need to handle them safely.

Included in the replication fix:
* Refactor the replication loop so that each key is replicated in a function call
  that returns an error, to make the workflow clearer and reduce nesting. Log
  the error and continue (see the sketch after this list).
* Improve stability of keyring replication tests. We no longer block leadership
  on initializing the keyring, so there's a race condition in the keyring tests
  where we can test for the existence of the root key before the keyring has
  been initialized. Change this to an "eventually" test.
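
As a minimal sketch of that refactor (self-contained and illustrative only — `replicateKey` and the types here are stand-ins, not Nomad's actual keyring replicator):

```go
package main

import (
	"errors"
	"fmt"
)

type RootKeyMeta struct{ KeyID string }

// replicateKey stands in for the per-key fetch-and-store work; any
// failure is returned to the caller instead of breaking the loop.
func replicateKey(key RootKeyMeta) error {
	if key.KeyID == "missing" {
		return errors.New("no key found in region")
	}
	return nil
}

func main() {
	keys := []RootKeyMeta{{"aaa"}, {"missing"}, {"zzz"}}
	for _, key := range keys {
		if err := replicateKey(key); err != nil {
			// log and continue: one bad key no longer prevents keys
			// later in the sort order from being replicated
			fmt.Printf("failed to replicate key %q: %v\n", key.KeyID, err)
			continue
		}
		fmt.Printf("replicated key %q\n", key.KeyID)
	}
}
```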

But these fixes aren't enough to fix #14981, because impacted clusters will
still see an error once a second complaining about the missing key. So we also
need to fix keyring GC so that those keys can be removed from the state
store. Now we store the key ID used to sign a workload identity on the
Allocation, and we index the Allocation table on that field so we can track
whether any live Allocation was signed with a particular key ID.
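
Indexing allocations by their signing key ID is what makes the GC's "is this key still in use?" check cheap. A hedged sketch of that idea with go-memdb (the table and index names here are illustrative, not necessarily the schema this commit adds):

```go
package main

import (
	"fmt"

	memdb "github.com/hashicorp/go-memdb"
)

type Allocation struct {
	ID           string
	SigningKeyID string // ID of the root key that signed this alloc's workload identity
}

func main() {
	schema := &memdb.DBSchema{
		Tables: map[string]*memdb.TableSchema{
			"allocs": {
				Name: "allocs",
				Indexes: map[string]*memdb.IndexSchema{
					"id": {
						Name:    "id",
						Unique:  true,
						Indexer: &memdb.StringFieldIndex{Field: "ID"},
					},
					// secondary index so "was any alloc signed with key X?"
					// is an index lookup rather than a full table scan
					"signing_key": {
						Name:         "signing_key",
						AllowMissing: true, // allocs that predate signing carry no key ID
						Indexer:      &memdb.StringFieldIndex{Field: "SigningKeyID"},
					},
				},
			},
		},
	}

	db, err := memdb.NewMemDB(schema)
	if err != nil {
		panic(err)
	}

	txn := db.Txn(true)
	if err := txn.Insert("allocs", &Allocation{ID: "alloc1", SigningKeyID: "key-1"}); err != nil {
		panic(err)
	}
	txn.Commit()

	iter, err := db.Txn(false).Get("allocs", "signing_key", "key-1")
	if err != nil {
		panic(err)
	}
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		fmt.Println("alloc signed by key-1:", raw.(*Allocation).ID)
	}
}
```
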
tgross committed Nov 1, 2022
1 parent c725c1b commit 6b2da83
Showing 13 changed files with 331 additions and 162 deletions.
2 changes: 1 addition & 1 deletion .changelog/15034.txt
@@ -1,3 +1,3 @@
```release-note:bug
keyring: Removed root key garbage collection to avoid orphaned workload identities
- ```
+ ```
7 changes: 7 additions & 0 deletions .changelog/15092.txt
@@ -0,0 +1,7 @@
```release-note:bug
keyring: Fixed a bug where a missing key would prevent any further replication.
```

```release-note:bug
keyring: Re-enabled keyring garbage collection after fixing a bug where keys would be garbage collected even if they were used to sign a live allocation's workload identity.
```
127 changes: 75 additions & 52 deletions nomad/core_sched.go
@@ -60,7 +60,7 @@ func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
case structs.CoreJobGlobalTokenExpiredGC:
return c.expiredACLTokenGC(eval, true)
case structs.CoreJobRootKeyRotateOrGC:
- return c.rootKeyRotate(eval)
+ return c.rootKeyRotateOrGC(eval)
case structs.CoreJobVariablesRekey:
return c.variablesRekey(eval)
case structs.CoreJobForceGC:
@@ -96,7 +96,9 @@ func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
if err := c.expiredACLTokenGC(eval, true); err != nil {
return err
}
-
+ if err := c.rootKeyGC(eval); err != nil {
+ return err
+ }
// Node GC must occur after the others to ensure the allocations are
// cleared.
return c.nodeGC(eval)
@@ -893,23 +895,88 @@ func (c *CoreScheduler) expiredACLTokenGC(eval *structs.Evaluation, global bool)
return c.srv.RPC(structs.ACLDeleteTokensRPCMethod, req, &structs.GenericResponse{})
}

+ // rootKeyRotateOrGC is used to rotate or garbage collect root keys
+ func (c *CoreScheduler) rootKeyRotateOrGC(eval *structs.Evaluation) error {
+
+ // a rotation will be sent to the leader so our view of state
+ // is no longer valid. we ack this core job and will pick up
+ // the GC work on the next interval
+ wasRotated, err := c.rootKeyRotate(eval)
+ if err != nil {
+ return err
+ }
+ if wasRotated {
+ return nil
+ }
+ return c.rootKeyGC(eval)
+ }
+
+ func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error {
+
+ oldThreshold := c.getThreshold(eval, "root key",
+ "root_key_gc_threshold", c.srv.config.RootKeyGCThreshold)
+
+ ws := memdb.NewWatchSet()
+ iter, err := c.snap.RootKeyMetas(ws)
+ if err != nil {
+ return err
+ }
+
+ for {
+ raw := iter.Next()
+ if raw == nil {
+ break
+ }
+ keyMeta := raw.(*structs.RootKeyMeta)
+ if keyMeta.Active() || keyMeta.Rekeying() {
+ continue // never GC the active key or one we're rekeying
+ }
+ if keyMeta.CreateIndex > oldThreshold {
+ continue // don't GC recent keys
+ }
+
+ inUse, err := c.snap.IsRootKeyMetaInUse(keyMeta.KeyID)
+ if err != nil {
+ return err
+ }
+ if inUse {
+ continue
+ }
+
+ req := &structs.KeyringDeleteRootKeyRequest{
+ KeyID: keyMeta.KeyID,
+ WriteRequest: structs.WriteRequest{
+ Region: c.srv.config.Region,
+ AuthToken: eval.LeaderACL,
+ },
+ }
+ if err := c.srv.RPC("Keyring.Delete",
+ req, &structs.KeyringDeleteRootKeyResponse{}); err != nil {
+ c.logger.Error("root key delete failed", "error", err)
+ return err
+ }
+ }
+
+ return nil
+ }

// rootKeyRotate checks if the active key is old enough that we need
// to kick off a rotation.
- func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation) error {
+ func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation) (bool, error) {

rotationThreshold := c.getThreshold(eval, "root key",
"root_key_rotation_threshold", c.srv.config.RootKeyRotationThreshold)

ws := memdb.NewWatchSet()
activeKey, err := c.snap.GetActiveRootKeyMeta(ws)
if err != nil {
- return err
+ return false, err
}
if activeKey == nil {
- return nil // no active key
+ return false, nil // no active key
}
if activeKey.CreateIndex >= rotationThreshold {
- return nil // key is too new
+ return false, nil // key is too new
}

req := &structs.KeyringRotateRootKeyRequest{
@@ -921,10 +988,10 @@ func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation) error {
if err := c.srv.RPC("Keyring.Rotate",
req, &structs.KeyringRotateRootKeyResponse{}); err != nil {
c.logger.Error("root key rotation failed", "error", err)
- return err
+ return false, err
}

- return nil
+ return true, nil
}

// variablesRekey is optionally run after rotating the active
@@ -958,29 +1025,6 @@ func (c *CoreScheduler) variablesRekey(eval *structs.Evaluation) error {
return err
}

- // we've now rotated all this key's variables, so set its state
- keyMeta = keyMeta.Copy()
- keyMeta.SetDeprecated()
-
- key, err := c.srv.encrypter.GetKey(keyMeta.KeyID)
- if err != nil {
- return err
- }
- req := &structs.KeyringUpdateRootKeyRequest{
- RootKey: &structs.RootKey{
- Meta: keyMeta,
- Key: key,
- },
- Rekey: false,
- WriteRequest: structs.WriteRequest{
- Region: c.srv.config.Region,
- AuthToken: eval.LeaderACL},
- }
- if err := c.srv.RPC("Keyring.Update",
- req, &structs.KeyringUpdateRootKeyResponse{}); err != nil {
- c.logger.Error("root key update failed", "error", err)
- return err
- }
}

return nil
@@ -1101,24 +1145,3 @@ func (c *CoreScheduler) getThreshold(eval *structs.Evaluation, objectName, confi
}
return oldThreshold
}
-
- // getOldestAllocationIndex returns the CreateIndex of the oldest
- // non-terminal allocation in the state store
- func (c *CoreScheduler) getOldestAllocationIndex() (uint64, error) {
- ws := memdb.NewWatchSet()
- allocs, err := c.snap.Allocs(ws, state.SortDefault)
- if err != nil {
- return 0, err
- }
- for {
- raw := allocs.Next()
- if raw == nil {
- break
- }
- alloc := raw.(*structs.Allocation)
- if !alloc.TerminalStatus() {
- return alloc.CreateIndex, nil
- }
- }
- return 0, nil
- }
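
For context on `getThreshold`, which the GC path above depends on but this commit doesn't change: Nomad's time table records (Raft index, wall-clock time) pairs so a configured duration like `root_key_gc_threshold` can be translated into a cutoff `CreateIndex`. A simplified, self-contained sketch of that idea (not the actual `TimeTable` implementation):

```go
package main

import (
	"fmt"
	"time"
)

// entry pairs a Raft index with the time it was witnessed.
type entry struct {
	index uint64
	when  time.Time
}

// timeTable is a toy stand-in for Nomad's TimeTable: entries are
// appended in order, so times are non-decreasing.
type timeTable struct{ entries []entry }

// nearestIndex returns the highest index witnessed at or before t;
// anything created at or below that index is older than the cutoff.
func (tt *timeTable) nearestIndex(t time.Time) uint64 {
	var idx uint64
	for _, e := range tt.entries {
		if e.when.After(t) {
			break
		}
		idx = e.index
	}
	return idx
}

func main() {
	now := time.Now().UTC()
	tt := &timeTable{entries: []entry{
		{index: 1000, when: now.Add(-2 * time.Hour)},
		{index: 2000, when: now.Add(-30 * time.Minute)},
	}}
	gcThreshold := 1 * time.Hour
	cutoff := now.Add(-gcThreshold)
	// keys with CreateIndex <= 1000 are older than the GC threshold
	fmt.Println("GC threshold index:", tt.nearestIndex(cutoff))
}
```
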
106 changes: 77 additions & 29 deletions nomad/core_sched_test.go
@@ -2466,38 +2466,98 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) {
// reset the time table
srv.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

// active key, will never be GC'd
store := srv.fsm.State()
key0, err := store.GetActiveRootKeyMeta(nil)
require.NotNil(t, key0, "expected keyring to be bootstrapped")
require.NoError(t, err)

+ // insert an "old" inactive key
+ key1 := structs.NewRootKeyMeta()
+ key1.SetInactive()
+ require.NoError(t, store.UpsertRootKeyMeta(600, key1, false))
+
+ // insert an "old" and inactive key with a variable that's using it
+ key2 := structs.NewRootKeyMeta()
+ key2.SetInactive()
+ require.NoError(t, store.UpsertRootKeyMeta(700, key2, false))
+
+ variable := mock.VariableEncrypted()
+ variable.KeyID = key2.KeyID
+
+ setResp := store.VarSet(601, &structs.VarApplyStateRequest{
+ Op: structs.VarOpSet,
+ Var: variable,
+ })
+ require.NoError(t, setResp.Error)
+
+ // insert an "old" key that's inactive but being used by a live alloc
+ key3 := structs.NewRootKeyMeta()
+ key3.SetInactive()
+ require.NoError(t, store.UpsertRootKeyMeta(800, key3, false))
+
+ // insert the live allocation using key3
+ alloc := mock.Alloc()
+ alloc.ClientStatus = structs.AllocClientStatusRunning
+ alloc.SigningKeyID = key3.KeyID
+ require.NoError(t, store.UpsertAllocs(
+ structs.MsgTypeTestSetup, 850, []*structs.Allocation{alloc}))
+
+ // insert an "old" key that's inactive and only used by a terminal alloc
+ key4 := structs.NewRootKeyMeta()
+ key4.SetInactive()
+ require.NoError(t, store.UpsertRootKeyMeta(900, key4, false))
+
+ // insert the dead allocation using key4
+ alloc2 := mock.Alloc()
+ alloc2.ClientStatus = structs.AllocClientStatusFailed
+ alloc2.DesiredStatus = structs.AllocDesiredStatusStop
+ alloc2.SigningKeyID = key4.KeyID
+ require.NoError(t, store.UpsertAllocs(
+ structs.MsgTypeTestSetup, 950, []*structs.Allocation{alloc2}))
+
+ // insert a time table index before the last key
+ tt := srv.fsm.TimeTable()
+ tt.Witness(1000, time.Now().UTC().Add(-1*srv.config.RootKeyGCThreshold))
+
+ // insert a "new" but inactive key
+ key5 := structs.NewRootKeyMeta()
+ key5.SetInactive()
+ require.NoError(t, store.UpsertRootKeyMeta(1500, key5, false))

// run the core job
snap, err := store.Snapshot()
require.NoError(t, err)
core := NewCoreScheduler(srv, snap)
eval := srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, 2000)
c := core.(*CoreScheduler)
- require.NoError(t, c.rootKeyRotate(eval))
+ require.NoError(t, c.rootKeyRotateOrGC(eval))

- got, err := store.GetActiveRootKeyMeta(nil)
- require.NotNil(t, got, "expected keyring to have an active key")
- require.Equal(t, got.KeyID, key0.KeyID)
+ ws := memdb.NewWatchSet()
+ key, err := store.RootKeyMetaByID(ws, key0.KeyID)
+ require.NoError(t, err)
+ require.NotNil(t, key, "active key should not have been GCd")

- // insert a time table index after the key
- tt := srv.fsm.TimeTable()
- tt.Witness(3000, time.Now().UTC().Add(-1*srv.config.RootKeyRotationThreshold))
+ key, err = store.RootKeyMetaByID(ws, key1.KeyID)
+ require.NoError(t, err)
+ require.Nil(t, key, "old and unused inactive key should have been GCd")

- // re-run the core job
- snap, err = store.Snapshot()
+ key, err = store.RootKeyMetaByID(ws, key2.KeyID)
+ require.NoError(t, err)
+ require.NotNil(t, key, "old key should not have been GCd if still in use")

+ key, err = store.RootKeyMetaByID(ws, key3.KeyID)
+ require.NoError(t, err)
+ require.NotNil(t, key, "old key used to sign a live alloc should not have been GCd")
+
+ key, err = store.RootKeyMetaByID(ws, key4.KeyID)
+ require.NoError(t, err)
+ require.Nil(t, key, "old key used to sign a terminal alloc should have been GCd")
+
- core = NewCoreScheduler(srv, snap)
- eval = srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, 4000)
- c = core.(*CoreScheduler)
- require.NoError(t, c.rootKeyRotate(eval))
-
- got, err = store.GetActiveRootKeyMeta(nil)
- require.NotNil(t, got, "expected keyring to have an active key")
- require.NotEqual(t, got.KeyID, key0.KeyID)
+ key, err = store.RootKeyMetaByID(ws, key5.KeyID)
+ require.NoError(t, err)
+ require.NotNil(t, key, "new key should not have been GCd")

}

// TestCoreScheduler_VariablesRekey exercises variables rekeying
@@ -2563,18 +2623,6 @@ func TestCoreScheduler_VariablesRekey(t *testing.T) {
}, time.Second*5, 100*time.Millisecond,
"variable rekey should be complete")

- iter, err := store.RootKeyMetas(memdb.NewWatchSet())
- require.NoError(t, err)
- for {
- raw := iter.Next()
- if raw == nil {
- break
- }
- keyMeta := raw.(*structs.RootKeyMeta)
- if keyMeta.KeyID != newKeyID {
- require.True(t, keyMeta.Deprecated())
- }
- }
}

func TestCoreScheduler_FailLoop(t *testing.T) {
