Skip to content

Commit

Permalink
fix panic from keyring raft entries being written during upgrade (#14821)
Browse files Browse the repository at this point in the history

During an upgrade to Nomad 1.4.0, if a server running 1.4.0 becomes the leader
before all of the 1.3.x servers have been upgraded, the old servers will crash
because the new leader initializes the keyring and writes a raft entry.

Wait until all members are on a version that supports the keyring before
initializing it.
  • Loading branch information
tgross committed Oct 6, 2022
1 parent 029c518 commit 6e108d3
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 15 deletions.
3 changes: 3 additions & 0 deletions .changelog/14821.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
keyring: Fixed a panic that can occur during upgrades to 1.4.0 when initializing the keyring
```
3 changes: 3 additions & 0 deletions nomad/encrypter.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,9 @@ func (e *Encrypter) activeKeySetLocked() (*keyset, error) {
if err != nil {
return nil, err
}
if keyMeta == nil {
return nil, fmt.Errorf("keyring has not been initialized yet")
}

return e.keysetByIDLocked(keyMeta.KeyID)
}
Expand Down
46 changes: 33 additions & 13 deletions nomad/leader.go
Original file line number Diff line number Diff line change
Expand Up @@ -294,10 +294,7 @@ func (s *Server) establishLeadership(stopCh chan struct{}) error {
schedulerConfig := s.getOrCreateSchedulerConfig()

// Create the first root key if it doesn't already exist
err := s.initializeKeyring()
if err != nil {
return err
}
go s.initializeKeyring(stopCh)

// Initialize the ClusterID
_, _ = s.ClusterID()
Expand Down Expand Up @@ -1966,43 +1963,66 @@ func (s *Server) getOrCreateSchedulerConfig() *structs.SchedulerConfiguration {
return config
}

var minVersionKeyring = version.Must(version.NewVersion("1.4.0"))

// initializeKeyring creates the first root key if the leader doesn't
// already have one. The metadata will be replicated via raft and then
// the followers will get the key material from their own key
// replication.
func (s *Server) initializeKeyring() error {
func (s *Server) initializeKeyring(stopCh <-chan struct{}) {

logger := s.logger.Named("keyring")

store := s.fsm.State()
keyMeta, err := store.GetActiveRootKeyMeta(nil)
if err != nil {
return err
logger.Error("failed to get active key: %v", err)
return
}
if keyMeta != nil {
return nil
return
}

logger.Trace("verifying cluster is ready to initialize keyring")
for {
select {
case <-stopCh:
return
default:
}
if ServersMeetMinimumVersion(s.serf.Members(), minVersionKeyring, true) {
break
}
}
// we might have lost leadership during the version check
if !s.IsLeader() {
return
}

s.logger.Named("core").Trace("initializing keyring")
logger.Trace("initializing keyring")

rootKey, err := structs.NewRootKey(structs.EncryptionAlgorithmAES256GCM)
rootKey.Meta.SetActive()
if err != nil {
return fmt.Errorf("could not initialize keyring: %v", err)
logger.Error("could not initialize keyring: %v", err)
return
}

err = s.encrypter.AddKey(rootKey)
if err != nil {
return fmt.Errorf("could not add initial key to keyring: %v", err)
logger.Error("could not add initial key to keyring: %v", err)
return
}

if _, _, err = s.raftApply(structs.RootKeyMetaUpsertRequestType,
structs.KeyringUpdateRootKeyMetaRequest{
RootKeyMeta: rootKey.Meta,
}); err != nil {
return fmt.Errorf("could not initialize keyring: %v", err)
logger.Error("could not initialize keyring: %v", err)
return
}

s.logger.Named("core").Info("initialized keyring", "id", rootKey.Meta.KeyID)
return nil
logger.Info("initialized keyring", "id", rootKey.Meta.KeyID)
}

func (s *Server) generateClusterID() (string, error) {
Expand Down
2 changes: 1 addition & 1 deletion nomad/plan_apply_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ func TestPlanApply_applyPlanWithNormalizedAllocs(t *testing.T) {
ci.Parallel(t)

s1, cleanupS1 := TestServer(t, func(c *Config) {
c.Build = "0.9.2"
c.Build = "1.4.0"
})
defer cleanupS1()
testutil.WaitForLeader(t, s1.RPC)
Expand Down
2 changes: 1 addition & 1 deletion nomad/worker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,7 @@ func TestWorker_SubmitPlanNormalizedAllocations(t *testing.T) {
s1, cleanupS1 := TestServer(t, func(c *Config) {
c.NumSchedulers = 0
c.EnabledSchedulers = []string{structs.JobTypeService}
c.Build = "0.9.2"
c.Build = "1.4.0"
})
defer cleanupS1()
testutil.WaitForLeader(t, s1.RPC)
Expand Down

0 comments on commit 6e108d3

Please sign in to comment.