diff --git a/.changelog/15034.txt b/.changelog/15034.txt new file mode 100644 index 000000000000..c418f18b8f77 --- /dev/null +++ b/.changelog/15034.txt @@ -0,0 +1,3 @@ +```release-note:bug +keyring: Removed root key garbage collection to avoid orphaned workload identities +``` diff --git a/nomad/core_sched.go b/nomad/core_sched.go index 321da820261f..8abd1c5407d8 100644 --- a/nomad/core_sched.go +++ b/nomad/core_sched.go @@ -60,7 +60,7 @@ func (c *CoreScheduler) Process(eval *structs.Evaluation) error { case structs.CoreJobGlobalTokenExpiredGC: return c.expiredACLTokenGC(eval, true) case structs.CoreJobRootKeyRotateOrGC: - return c.rootKeyRotateOrGC(eval) + return c.rootKeyRotate(eval) case structs.CoreJobVariablesRekey: return c.variablesRekey(eval) case structs.CoreJobForceGC: @@ -96,9 +96,7 @@ func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error { if err := c.expiredACLTokenGC(eval, true); err != nil { return err } - if err := c.rootKeyGC(eval); err != nil { - return err - } + // Node GC must occur after the others to ensure the allocations are // cleared. return c.nodeGC(eval) @@ -895,86 +893,9 @@ func (c *CoreScheduler) expiredACLTokenGC(eval *structs.Evaluation, global bool) return c.srv.RPC(structs.ACLDeleteTokensRPCMethod, req, &structs.GenericResponse{}) } -// rootKeyRotateOrGC is used to rotate or garbage collect root keys -func (c *CoreScheduler) rootKeyRotateOrGC(eval *structs.Evaluation) error { - - // a rotation will be sent to the leader so our view of state - // is no longer valid. we ack this core job and will pick up - // the GC work on the next interval - wasRotated, err := c.rootKeyRotation(eval) - if err != nil { - return err - } - if wasRotated { - return nil - } - return c.rootKeyGC(eval) -} - -func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error { - - // we can't GC any key older than the oldest live allocation - // because it might have signed that allocation's workload - // identity; this is conservative so that we don't have to iterate - // over all the allocations and find out which keys signed their - // identity, which will be expensive on large clusters - allocOldThreshold, err := c.getOldestAllocationIndex() - if err != nil { - return err - } - - oldThreshold := c.getThreshold(eval, "root key", - "root_key_gc_threshold", c.srv.config.RootKeyGCThreshold) - - ws := memdb.NewWatchSet() - iter, err := c.snap.RootKeyMetas(ws) - if err != nil { - return err - } - - for { - raw := iter.Next() - if raw == nil { - break - } - keyMeta := raw.(*structs.RootKeyMeta) - if keyMeta.Active() { - continue // never GC the active key - } - if keyMeta.CreateIndex > oldThreshold { - continue // don't GC recent keys - } - if keyMeta.CreateIndex > allocOldThreshold { - continue // don't GC keys possibly used to sign live allocations - } - varIter, err := c.snap.GetVariablesByKeyID(ws, keyMeta.KeyID) - if err != nil { - return err - } - if varIter.Next() != nil { - continue // key is still in use - } - - req := &structs.KeyringDeleteRootKeyRequest{ - KeyID: keyMeta.KeyID, - WriteRequest: structs.WriteRequest{ - Region: c.srv.config.Region, - AuthToken: eval.LeaderACL, - }, - } - if err := c.srv.RPC("Keyring.Delete", - req, &structs.KeyringDeleteRootKeyResponse{}); err != nil { - c.logger.Error("root key delete failed", "error", err) - return err - } - } - - return nil -} - -// rootKeyRotation checks if the active key is old enough that we need -// to kick off a rotation. Returns true if the key was rotated. -func (c *CoreScheduler) rootKeyRotation(eval *structs.Evaluation) (bool, error) { +// rootKeyRotate checks if the active key is old enough that we need +// to kick off a rotation. +func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation) error { rotationThreshold := c.getThreshold(eval, "root key", "root_key_rotation_threshold", c.srv.config.RootKeyRotationThreshold) @@ -982,13 +903,13 @@ func (c *CoreScheduler) rootKeyRotation(eval *structs.Evaluation) (bool, error) ws := memdb.NewWatchSet() activeKey, err := c.snap.GetActiveRootKeyMeta(ws) if err != nil { - return false, err + return err } if activeKey == nil { - return false, nil // no active key + return nil // no active key } if activeKey.CreateIndex >= rotationThreshold { - return false, nil // key is too new + return nil // key is too new } req := &structs.KeyringRotateRootKeyRequest{ @@ -1000,10 +921,10 @@ func (c *CoreScheduler) rootKeyRotation(eval *structs.Evaluation) (bool, error) if err := c.srv.RPC("Keyring.Rotate", req, &structs.KeyringRotateRootKeyResponse{}); err != nil { c.logger.Error("root key rotation failed", "error", err) - return false, err + return err } - return true, nil + return nil } // variablesReKey is optionally run after rotating the active diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index 27b8410f3c48..66aa3b2d9f33 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -2471,73 +2471,33 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) { require.NotNil(t, key0, "expected keyring to be bootstapped") require.NoError(t, err) - // insert an "old" and inactive key - key1 := structs.NewRootKeyMeta() - key1.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(500, key1, false)) - - // insert an "old" and inactive key with a variable that's using it - key2 := structs.NewRootKeyMeta() - key2.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(600, key2, false)) - - variable := mock.VariableEncrypted() - variable.KeyID = key2.KeyID - - setResp := store.VarSet(601, &structs.VarApplyStateRequest{ - Op: structs.VarOpSet, - Var: variable, - }) - require.NoError(t, setResp.Error) - - // insert an allocation - alloc := mock.Alloc() - alloc.ClientStatus = structs.AllocClientStatusRunning - require.NoError(t, store.UpsertAllocs( - structs.MsgTypeTestSetup, 700, []*structs.Allocation{alloc})) - - // insert an "old" key that's newer than oldest alloc - key3 := structs.NewRootKeyMeta() - key3.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(750, key3, false)) - - // insert a time table index before the last key - tt := srv.fsm.TimeTable() - tt.Witness(1000, time.Now().UTC().Add(-1*srv.config.RootKeyGCThreshold)) - - // insert a "new" but inactive key - key4 := structs.NewRootKeyMeta() - key4.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(1500, key4, false)) - // run the core job snap, err := store.Snapshot() require.NoError(t, err) core := NewCoreScheduler(srv, snap) eval := srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, 2000) c := core.(*CoreScheduler) - require.NoError(t, c.rootKeyRotateOrGC(eval)) + require.NoError(t, c.rootKeyRotate(eval)) - ws := memdb.NewWatchSet() - key, err := store.RootKeyMetaByID(ws, key0.KeyID) - require.NoError(t, err) - require.NotNil(t, key, "active key should not have been GCd") + got, err := store.GetActiveRootKeyMeta(nil) + require.NotNil(t, got, "expected keyring to have an active key") + require.Equal(t, got.KeyID, key0.KeyID) - key, err = store.RootKeyMetaByID(ws, key1.KeyID) - require.NoError(t, err) - require.Nil(t, key, "old key should have been GCd") - - key, err = store.RootKeyMetaByID(ws, key2.KeyID) - require.NoError(t, err) - require.NotNil(t, key, "old key should not have been GCd if still in use") - - key, err = store.RootKeyMetaByID(ws, key3.KeyID) - require.NoError(t, err) - require.NotNil(t, key, "old key newer than oldest alloc should not have been GCd") + // insert a time table index after the key + tt := srv.fsm.TimeTable() + tt.Witness(3000, time.Now().UTC().Add(-1*srv.config.RootKeyRotationThreshold)) - key, err = store.RootKeyMetaByID(ws, key4.KeyID) + // re-run the core job + snap, err = store.Snapshot() require.NoError(t, err) - require.NotNil(t, key, "new key should not have been GCd") + core = NewCoreScheduler(srv, snap) + eval = srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, 4000) + c = core.(*CoreScheduler) + require.NoError(t, c.rootKeyRotate(eval)) + + got, err = store.GetActiveRootKeyMeta(nil) + require.NotNil(t, got, "expected keyring to have an active key") + require.NotEqual(t, got.KeyID, key0.KeyID) } // TestCoreScheduler_VariablesRekey exercises variables rekeying diff --git a/website/content/docs/configuration/server.mdx b/website/content/docs/configuration/server.mdx index d12ae04fdff5..efdc68f04f4b 100644 --- a/website/content/docs/configuration/server.mdx +++ b/website/content/docs/configuration/server.mdx @@ -200,15 +200,11 @@ server { rejoin the cluster. - `root_key_gc_interval` `(string: "10m")` - Specifies the interval between - [encryption key][] metadata garbage collections. - -- `root_key_gc_threshold` `(string: "1h")` - Specifies the minimum time that an - [encryption key][] must exist before it can be eligible for garbage - collection. + checks to rotate the root [encryption key][]. - `root_key_rotation_threshold` `(string: "720h")` - Specifies the minimum time that an [encryption key][] must exist before it is automatically rotated on - the next garbage collection interval. + the next `root_key_gc_interval`. - `server_join` ([server_join][server-join]: nil) - Specifies how the Nomad server will connect to other Nomad servers. The `retry_join` @@ -392,7 +388,7 @@ regardless of cluster size. The base formula for determining how often a Client must heartbeat is: ``` - / + / ``` Other factors modify this base TTL: @@ -408,7 +404,7 @@ Other factors modify this base TTL: to successfully heartbeat. This gives Clients time to discover a functioning Server in case they were directly connected to a leader that crashed. -For example, given the default values for heartbeat parameters, different sized +For example, given the default values for heartbeat parameters, different sized clusters will use the following TTLs for the heartbeats. Note that the `Server TTL` simply adds the `heartbeat_grace` parameter to the TTL Clients are given. @@ -425,7 +421,7 @@ Regardless of size, all clients will have a Server TTL of than the maximum Client TTL for your cluster size in order to prevent marking live Clients as `down`. -For clusters over 5000 Clients you should increase `failover_heartbeat_ttl` +For clusters over 5000 Clients you should increase `failover_heartbeat_ttl` using the following formula: ```