Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

keyring: safely handle missing keys and restore GC #15092

Merged
merged 11 commits into from
Nov 1, 2022
2 changes: 1 addition & 1 deletion .changelog/15034.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
```release-note:bug
keyring: Removed root key garbage collection to avoid orphaned workload identities
```
```
7 changes: 7 additions & 0 deletions .changelog/15092.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
```release-note:bug
keyring: Fixed a bug where a missing key would prevent any further replication.
```

```release-note:bug
keyring: Re-enabled keyring garbage collection after fixing a bug where keys would be garbage collected even if they were used to sign a live allocation's workload identity.
```
127 changes: 75 additions & 52 deletions nomad/core_sched.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
case structs.CoreJobGlobalTokenExpiredGC:
return c.expiredACLTokenGC(eval, true)
case structs.CoreJobRootKeyRotateOrGC:
return c.rootKeyRotate(eval)
return c.rootKeyRotateOrGC(eval)
case structs.CoreJobVariablesRekey:
return c.variablesRekey(eval)
case structs.CoreJobForceGC:
Expand Down Expand Up @@ -96,7 +96,9 @@ func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
if err := c.expiredACLTokenGC(eval, true); err != nil {
return err
}

if err := c.rootKeyGC(eval); err != nil {
return err
}
// Node GC must occur after the others to ensure the allocations are
// cleared.
return c.nodeGC(eval)
Expand Down Expand Up @@ -893,23 +895,88 @@ func (c *CoreScheduler) expiredACLTokenGC(eval *structs.Evaluation, global bool)
return c.srv.RPC(structs.ACLDeleteTokensRPCMethod, req, &structs.GenericResponse{})
}

// rootKeyRotateOrGC is used to rotate or garbage collect root keys
func (c *CoreScheduler) rootKeyRotateOrGC(eval *structs.Evaluation) error {

// a rotation will be sent to the leader so our view of state
// is no longer valid. we ack this core job and will pick up
// the GC work on the next interval
wasRotated, err := c.rootKeyRotate(eval)
if err != nil {
return err
}
if wasRotated {
return nil
}
return c.rootKeyGC(eval)
}

func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error {

oldThreshold := c.getThreshold(eval, "root key",
"root_key_gc_threshold", c.srv.config.RootKeyGCThreshold)

ws := memdb.NewWatchSet()
iter, err := c.snap.RootKeyMetas(ws)
if err != nil {
return err
}

for {
raw := iter.Next()
if raw == nil {
break
}
keyMeta := raw.(*structs.RootKeyMeta)
if keyMeta.Active() || keyMeta.Rekeying() {
continue // never GC the active key or one we're rekeying
}
if keyMeta.CreateIndex > oldThreshold {
continue // don't GC recent keys
}

inUse, err := c.snap.IsRootKeyMetaInUse(keyMeta.KeyID)
if err != nil {
return err
}
if inUse {
continue
}

req := &structs.KeyringDeleteRootKeyRequest{
KeyID: keyMeta.KeyID,
WriteRequest: structs.WriteRequest{
Region: c.srv.config.Region,
AuthToken: eval.LeaderACL,
},
}
if err := c.srv.RPC("Keyring.Delete",
req, &structs.KeyringDeleteRootKeyResponse{}); err != nil {
c.logger.Error("root key delete failed", "error", err)
return err
}
}

return nil
}

// rootKeyRotate checks if the active key is old enough that we need
// to kick off a rotation.
func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation) error {
func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation) (bool, error) {

rotationThreshold := c.getThreshold(eval, "root key",
"root_key_rotation_threshold", c.srv.config.RootKeyRotationThreshold)

ws := memdb.NewWatchSet()
activeKey, err := c.snap.GetActiveRootKeyMeta(ws)
if err != nil {
return err
return false, err
}
if activeKey == nil {
return nil // no active key
return false, nil // no active key
}
if activeKey.CreateIndex >= rotationThreshold {
return nil // key is too new
return false, nil // key is too new
}

req := &structs.KeyringRotateRootKeyRequest{
Expand All @@ -921,10 +988,10 @@ func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation) error {
if err := c.srv.RPC("Keyring.Rotate",
req, &structs.KeyringRotateRootKeyResponse{}); err != nil {
c.logger.Error("root key rotation failed", "error", err)
return err
return false, err
}

return nil
return true, nil
}

// variablesReKey is optionally run after rotating the active
Expand Down Expand Up @@ -958,29 +1025,6 @@ func (c *CoreScheduler) variablesRekey(eval *structs.Evaluation) error {
return err
}

// we've now rotated all this key's variables, so set its state
keyMeta = keyMeta.Copy()
keyMeta.SetDeprecated()

key, err := c.srv.encrypter.GetKey(keyMeta.KeyID)
if err != nil {
return err
}
req := &structs.KeyringUpdateRootKeyRequest{
RootKey: &structs.RootKey{
Meta: keyMeta,
Key: key,
},
Rekey: false,
WriteRequest: structs.WriteRequest{
Region: c.srv.config.Region,
AuthToken: eval.LeaderACL},
}
if err := c.srv.RPC("Keyring.Update",
req, &structs.KeyringUpdateRootKeyResponse{}); err != nil {
c.logger.Error("root key update failed", "error", err)
return err
}
}

return nil
Expand Down Expand Up @@ -1101,24 +1145,3 @@ func (c *CoreScheduler) getThreshold(eval *structs.Evaluation, objectName, confi
}
return oldThreshold
}

// getOldestAllocationIndex returns the CreateIndex of the oldest
// non-terminal allocation in the state store
func (c *CoreScheduler) getOldestAllocationIndex() (uint64, error) {
ws := memdb.NewWatchSet()
allocs, err := c.snap.Allocs(ws, state.SortDefault)
if err != nil {
return 0, err
}
for {
raw := allocs.Next()
if raw == nil {
break
}
alloc := raw.(*structs.Allocation)
if !alloc.TerminalStatus() {
return alloc.CreateIndex, nil
}
}
return 0, nil
}
106 changes: 77 additions & 29 deletions nomad/core_sched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2466,38 +2466,98 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) {
// reset the time table
srv.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

// active key, will never be GC'd
store := srv.fsm.State()
key0, err := store.GetActiveRootKeyMeta(nil)
require.NotNil(t, key0, "expected keyring to be bootstapped")
require.NoError(t, err)

// insert an "old" inactive key
key1 := structs.NewRootKeyMeta()
key1.SetInactive()
require.NoError(t, store.UpsertRootKeyMeta(600, key1, false))

// insert an "old" and inactive key with a variable that's using it
key2 := structs.NewRootKeyMeta()
key2.SetInactive()
require.NoError(t, store.UpsertRootKeyMeta(700, key2, false))

variable := mock.VariableEncrypted()
variable.KeyID = key2.KeyID

setResp := store.VarSet(601, &structs.VarApplyStateRequest{
Op: structs.VarOpSet,
Var: variable,
})
require.NoError(t, setResp.Error)

// insert an "old" key that's inactive but being used by an alloc
key3 := structs.NewRootKeyMeta()
key3.SetInactive()
require.NoError(t, store.UpsertRootKeyMeta(800, key3, false))

// insert the allocation using key3
alloc := mock.Alloc()
alloc.ClientStatus = structs.AllocClientStatusRunning
alloc.SigningKeyID = key3.KeyID
require.NoError(t, store.UpsertAllocs(
structs.MsgTypeTestSetup, 850, []*structs.Allocation{alloc}))

// insert an "old" key that's inactive but being used by an alloc
key4 := structs.NewRootKeyMeta()
key4.SetInactive()
require.NoError(t, store.UpsertRootKeyMeta(900, key4, false))

// insert the dead allocation using key4
alloc2 := mock.Alloc()
alloc2.ClientStatus = structs.AllocClientStatusFailed
alloc2.DesiredStatus = structs.AllocDesiredStatusStop
alloc2.SigningKeyID = key4.KeyID
require.NoError(t, store.UpsertAllocs(
structs.MsgTypeTestSetup, 950, []*structs.Allocation{alloc2}))

// insert a time table index before the last key
tt := srv.fsm.TimeTable()
tt.Witness(1000, time.Now().UTC().Add(-1*srv.config.RootKeyGCThreshold))

// insert a "new" but inactive key
key5 := structs.NewRootKeyMeta()
key5.SetInactive()
require.NoError(t, store.UpsertRootKeyMeta(1500, key5, false))

// run the core job
snap, err := store.Snapshot()
require.NoError(t, err)
core := NewCoreScheduler(srv, snap)
eval := srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, 2000)
c := core.(*CoreScheduler)
require.NoError(t, c.rootKeyRotate(eval))
require.NoError(t, c.rootKeyRotateOrGC(eval))

got, err := store.GetActiveRootKeyMeta(nil)
require.NotNil(t, got, "expected keyring to have an active key")
require.Equal(t, got.KeyID, key0.KeyID)
ws := memdb.NewWatchSet()
key, err := store.RootKeyMetaByID(ws, key0.KeyID)
require.NoError(t, err)
require.NotNil(t, key, "active key should not have been GCd")

// insert a time table index after the key
tt := srv.fsm.TimeTable()
tt.Witness(3000, time.Now().UTC().Add(-1*srv.config.RootKeyRotationThreshold))
key, err = store.RootKeyMetaByID(ws, key1.KeyID)
require.NoError(t, err)
require.Nil(t, key, "old and unused inactive key should have been GCd")

// re-run the core job
snap, err = store.Snapshot()
key, err = store.RootKeyMetaByID(ws, key2.KeyID)
require.NoError(t, err)
require.NotNil(t, key, "old key should not have been GCd if still in use")

key, err = store.RootKeyMetaByID(ws, key3.KeyID)
require.NoError(t, err)
require.NotNil(t, key, "old key used to sign a live alloc should not have been GCd")

key, err = store.RootKeyMetaByID(ws, key4.KeyID)
require.NoError(t, err)
require.Nil(t, key, "old key used to sign a terminal alloc should have been GCd")

key, err = store.RootKeyMetaByID(ws, key5.KeyID)
require.NoError(t, err)
core = NewCoreScheduler(srv, snap)
eval = srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, 4000)
c = core.(*CoreScheduler)
require.NoError(t, c.rootKeyRotate(eval))

got, err = store.GetActiveRootKeyMeta(nil)
require.NotNil(t, got, "expected keyring to have an active key")
require.NotEqual(t, got.KeyID, key0.KeyID)
require.NotNil(t, key, "new key should not have been GCd")

}

// TestCoreScheduler_VariablesRekey exercises variables rekeying
Expand Down Expand Up @@ -2563,18 +2623,6 @@ func TestCoreScheduler_VariablesRekey(t *testing.T) {
}, time.Second*5, 100*time.Millisecond,
"variable rekey should be complete")

iter, err := store.RootKeyMetas(memdb.NewWatchSet())
require.NoError(t, err)
for {
raw := iter.Next()
if raw == nil {
break
}
keyMeta := raw.(*structs.RootKeyMeta)
if keyMeta.KeyID != newKeyID {
require.True(t, keyMeta.Deprecated())
}
}
}

func TestCoreScheduler_FailLoop(t *testing.T) {
Expand Down
Loading