Skip to content

Commit

Permalink
Try to get the running version based on the reachable coordinators du…
Browse files Browse the repository at this point in the history
…ring an upgrade (#2098)

* Try to get the running version based on the reachable coordinators during an upgrade
  • Loading branch information
johscheuer committed Jul 10, 2024
1 parent 9b6db3d commit 1291561
Show file tree
Hide file tree
Showing 7 changed files with 209 additions and 9 deletions.
16 changes: 16 additions & 0 deletions controllers/cluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -592,7 +592,23 @@ func (r *FoundationDBClusterReconciler) getStatusFromClusterOrDummyStatus(logger
return status, nil
}

// When we reached this part of the code the above GetStatus() called failed for some reason.
if cluster.Status.Configured {
// If the cluster is currently under a version incompatible upgrade, we try to assume the current version based
// on the coordinator reachability. If all (or the majority) of coordinators are reachable with a specific version
// of fdbcli we can assume that the cluster is running with that version and update the cluster.Status.RunningVersion.
// In theory we could use the go bindings if they would expose that information from the multi-version bindings.
if cluster.IsBeingUpgradedWithVersionIncompatibleVersion() {
// Set the version from the reachable coordinators, if the version points to the desired version defined
// in the cluster.Spec.Version, this will unblock some further steps, to allow the operator to bring the cluster
// back into a better state.
versionFromReachableCoordinators := adminClient.GetVersionFromReachableCoordinators()
if versionFromReachableCoordinators != "" && versionFromReachableCoordinators != cluster.Status.RunningVersion {
logger.Info("Update running version in cluster status from reachable coordinators", "versionFromReachableCoordinators", versionFromReachableCoordinators, "currentRunningVersion", cluster.Status.RunningVersion)
cluster.Status.RunningVersion = versionFromReachableCoordinators
}
}

return nil, err
}

Expand Down
9 changes: 8 additions & 1 deletion e2e/test_operator_ha_upgrades/operator_ha_upgrade_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,14 @@ var _ = Describe("Operator HA Upgrades", Label("e2e", "pr"), func() {
DescribeTable(
"when no remote storage processes are restarted",
func(beforeVersion string, targetVersion string) {
clusterSetup(beforeVersion, false)
// We disable the health check here, as the remote storage processes are not restarted and this can be
// disruptive to the cluster.
clusterSetupWithTestConfig(testConfig{
beforeVersion: beforeVersion,
enableOperatorPodChaos: false,
enableHealthCheck: false,
loadData: false,
})

// Select remote storage processes and use the buggify option to skip those
// processes during the restart command.
Expand Down
45 changes: 41 additions & 4 deletions fdbclient/admin_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ func (client *cliAdminClient) runCommand(command cliCommand) (string, error) {
}

// getStatusFromCli uses the fdbcli to connect to the FDB cluster
func (client *cliAdminClient) getStatusFromCli() (*fdbv1beta2.FoundationDBStatus, error) {
func (client *cliAdminClient) getStatusFromCli(checkForProcesses bool) (*fdbv1beta2.FoundationDBStatus, error) {
// Always use the max timeout here. Otherwise we will retry multiple times with an increasing timeout. As the
// timeout is only the upper bound using directly the max timeout reduces the calls to a single call.
output, err := client.runCommand(cliCommand{command: "status json", timeout: client.getTimeout()})
Expand All @@ -288,13 +288,13 @@ func (client *cliAdminClient) getStatusFromCli() (*fdbv1beta2.FoundationDBStatus
return nil, err
}

return parseMachineReadableStatus(client.log, contents)
return parseMachineReadableStatus(client.log, contents, checkForProcesses)
}

// getStatus uses fdbcli to connect to the FDB cluster, if the cluster is upgraded and the initial version returns no processes
// the new version for fdbcli will be tried.
func (client *cliAdminClient) getStatus() (*fdbv1beta2.FoundationDBStatus, error) {
status, err := client.getStatusFromCli()
status, err := client.getStatusFromCli(true)

// If the cluster is under an upgrade and the getStatus call returns an error, we have to retry it with the new version,
// as it could be that the wrong version was selected.
Expand All @@ -305,7 +305,7 @@ func (client *cliAdminClient) getStatus() (*fdbv1beta2.FoundationDBStatus, error
clusterCopy.Status.RunningVersion = clusterCopy.Spec.Version
client.Cluster = clusterCopy

return client.getStatusFromCli()
return client.getStatusFromCli(true)
}

return status, err
Expand Down Expand Up @@ -823,3 +823,40 @@ func (client *cliAdminClient) SetProcessesUnderMaintenance(processGroupIDs []fdb

return err
}

// GetVersionFromReachableCoordinators will return the running version based on the reachable coordinators. This method
// can be used during version incompatible upgrades and based on the responses of the coordinators, this method will
// assume the current running version of the cluster. If the fdbcli calls for none of the provided version return
// a majority of reachable coordinators, an empty string will be returned.
func (client *cliAdminClient) GetVersionFromReachableCoordinators() string {
// First we test to get the status from the fdbcli with the current running version defined in cluster.Status.RunningVersion.
status, _ := client.getStatusFromCli(false)
if quorumOfCoordinatorsAreReachable(status) {
return client.Cluster.GetRunningVersion()
}

// If the majority of coordinators are not reachable with the cluster.Status.RunningVersion, we try the desired version
// if the cluster is currently performing an version incompatible upgrade.
if client.Cluster.IsBeingUpgradedWithVersionIncompatibleVersion() {
// Create a copy of the cluster and make use of the desired version instead of the last observed running version.
clusterCopy := client.Cluster.DeepCopy()
clusterCopy.Status.RunningVersion = clusterCopy.Spec.Version
client.Cluster = clusterCopy
status, _ = client.getStatusFromCli(false)
if quorumOfCoordinatorsAreReachable(status) {
return clusterCopy.GetRunningVersion()
}
}

return ""
}

// quorumOfCoordinatorsAreReachable return false if the status is nil otherwise it will return the value of QuorumReachable.
// QuorumReachable will be true if the client was able to reach a quorum of the coordinators.
func quorumOfCoordinatorsAreReachable(status *fdbv1beta2.FoundationDBStatus) bool {
if status == nil {
return false
}

return status.Client.Coordinators.QuorumReachable
}
127 changes: 126 additions & 1 deletion fdbclient/admin_client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1167,5 +1167,130 @@ protocol fdb00b071010000`,
})
})

// TODO(johscheuer): Add test case for timeout.
When("getting the version from the reachable coordinators", func() {
var mockRunner *mockCommandRunner
var version, previousBinary, newBinary, quorumReachableStatus, quorumNotReachableStatus string
var err error
previousVersion := "7.1.55"
newVersion := "7.3.33"

JustBeforeEach(func() {
cliClient := &cliAdminClient{
Cluster: &fdbv1beta2.FoundationDBCluster{
Spec: fdbv1beta2.FoundationDBClusterSpec{
Version: newVersion,
},
Status: fdbv1beta2.FoundationDBClusterStatus{
RunningVersion: previousVersion,
},
},
clusterFilePath: "test",
log: logr.Discard(),
cmdRunner: mockRunner,
}

version = cliClient.GetVersionFromReachableCoordinators()
})

BeforeEach(func() {
tmpDir := GinkgoT().TempDir()
GinkgoT().Setenv("FDB_BINARY_DIR", tmpDir)

binaryDir := path.Join(tmpDir, "7.1")
Expect(os.MkdirAll(binaryDir, 0700)).NotTo(HaveOccurred())
previousBinary = path.Join(binaryDir, fdbcliStr)
_, err := os.Create(previousBinary)
Expect(err).NotTo(HaveOccurred())

binaryDir = path.Join(tmpDir, "7.3")
Expect(os.MkdirAll(binaryDir, 0700)).NotTo(HaveOccurred())
newBinary = path.Join(binaryDir, fdbcliStr)
_, err = os.Create(newBinary)
Expect(err).NotTo(HaveOccurred())

quorumReachable := &fdbv1beta2.FoundationDBStatus{
Client: fdbv1beta2.FoundationDBStatusLocalClientInfo{
DatabaseStatus: fdbv1beta2.FoundationDBStatusClientDBStatus{
Available: true,
},
Coordinators: fdbv1beta2.FoundationDBStatusCoordinatorInfo{
QuorumReachable: true,
},
},
}
quorumReachableStatusOut, err := json.Marshal(quorumReachable)
Expect(err).NotTo(HaveOccurred())
quorumReachableStatus = string(quorumReachableStatusOut)

quorumNotReachable := &fdbv1beta2.FoundationDBStatus{
Client: fdbv1beta2.FoundationDBStatusLocalClientInfo{
Coordinators: fdbv1beta2.FoundationDBStatusCoordinatorInfo{
QuorumReachable: false,
},
},
}
quorumNotReachableStatusOut, err := json.Marshal(quorumNotReachable)
Expect(err).NotTo(HaveOccurred())
quorumNotReachableStatus = string(quorumNotReachableStatusOut)
})

When("the fdbcli call for the previous version returns that the quorum is reachable", func() {
BeforeEach(func() {
mockRunner = &mockCommandRunner{
mockedError: nil,
mockedOutputPerBinary: map[string]string{
previousBinary: quorumReachableStatus,
newBinary: quorumNotReachableStatus,
},
}
})

It("should report the previous version", func() {
Expect(version).To(Equal(previousVersion))
Expect(err).NotTo(HaveOccurred())
Expect(mockRunner.receivedBinary).To(HaveLen(1))
Expect(mockRunner.receivedBinary[0]).To(HaveSuffix("7.1/" + fdbcliStr))
})
})

When("the fdbcli call for the new version returns that the quorum is reachable", func() {
BeforeEach(func() {
mockRunner = &mockCommandRunner{
mockedError: nil,
mockedOutputPerBinary: map[string]string{
previousBinary: quorumNotReachableStatus,
newBinary: quorumReachableStatus,
},
}
})

It("should report the new version", func() {
Expect(version).To(Equal(newVersion))
Expect(err).NotTo(HaveOccurred())
Expect(mockRunner.receivedBinary).To(HaveLen(2))
Expect(mockRunner.receivedBinary[0]).To(HaveSuffix("7.1/" + fdbcliStr))
Expect(mockRunner.receivedBinary[1]).To(HaveSuffix("7.3/" + fdbcliStr))
})
})

When("none of the fdbcli calls returns that the quorum is reachable", func() {
BeforeEach(func() {
mockRunner = &mockCommandRunner{
mockedError: nil,
mockedOutputPerBinary: map[string]string{
previousBinary: quorumNotReachableStatus,
newBinary: quorumNotReachableStatus,
},
}
})

It("should report an empty string", func() {
Expect(version).To(BeEmpty())
Expect(err).NotTo(HaveOccurred())
Expect(mockRunner.receivedBinary).To(HaveLen(2))
Expect(mockRunner.receivedBinary[0]).To(HaveSuffix("7.1/" + fdbcliStr))
Expect(mockRunner.receivedBinary[1]).To(HaveSuffix("7.3/" + fdbcliStr))
})
})
})
})
6 changes: 3 additions & 3 deletions fdbclient/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ const (
defaultTransactionTimeout = 5 * time.Second
)

func parseMachineReadableStatus(logger logr.Logger, contents []byte) (*fdbv1beta2.FoundationDBStatus, error) {
func parseMachineReadableStatus(logger logr.Logger, contents []byte, checkForProcesses bool) (*fdbv1beta2.FoundationDBStatus, error) {
status := &fdbv1beta2.FoundationDBStatus{}
err := json.Unmarshal(contents, status)
if err != nil {
Expand Down Expand Up @@ -75,7 +75,7 @@ func parseMachineReadableStatus(logger logr.Logger, contents []byte) (*fdbv1beta
}
}

if len(status.Cluster.Processes) == 0 {
if len(status.Cluster.Processes) == 0 && checkForProcesses {
logger.Info("machine-readable status is missing process information")
return nil, fdbv1beta2.TimeoutError{Err: fmt.Errorf("machine-readable status is missing process information")}
}
Expand Down Expand Up @@ -141,7 +141,7 @@ func getStatusFromDB(libClient fdbLibClient, logger logr.Logger, timeout time.Du
return nil, err
}

return parseMachineReadableStatus(logger, contents)
return parseMachineReadableStatus(logger, contents, true)
}

type realDatabaseClientProvider struct {
Expand Down
6 changes: 6 additions & 0 deletions pkg/fdbadminclient/admin_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,4 +132,10 @@ type AdminClient interface {
// down for maintenance. The value will be the provided time stamp. If a process group is already present in the
// list, the timestamp will be updated.
SetProcessesUnderMaintenance([]fdbv1beta2.ProcessGroupID, int64) error

// GetVersionFromReachableCoordinators will return the running version based on the reachable coordinators. This method
// can be used during version incompatible upgrades and based on the responses of the coordinators, this method will
// assume the current running version of the cluster. If the fdbcli calls for none of the provided version return
// a majority of reachable coordinators, the default version from the cluster.Status.RunningVersion will be returned.
GetVersionFromReachableCoordinators() string
}
9 changes: 9 additions & 0 deletions pkg/fdbadminclient/mock/admin_client_mock.go
Original file line number Diff line number Diff line change
Expand Up @@ -1098,3 +1098,12 @@ func (client *AdminClient) SetProcessesUnderMaintenance(ids []fdbv1beta2.Process

return nil
}

// GetVersionFromReachableCoordinators will return the running version based on the reachable coordinators. This method
// can be used during version incompatible upgrades and based on the responses of the coordinators, this method will
// assume the current running version of the cluster. If the fdbcli calls for none of the provided version return
// a majority of reachable coordinators, the default version from the cluster.Status.RunningVersion will be returned.
func (client *AdminClient) GetVersionFromReachableCoordinators() string {
// TODO (johscheuer): In the future we could allow to mock another running version.
return client.Cluster.Status.RunningVersion
}

0 comments on commit 1291561

Please sign in to comment.