Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check if the database is available before doing any exclusion checks #1758

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions controllers/remove_process_groups.go
Original file line number Diff line number Diff line change
Expand Up @@ -314,8 +314,7 @@ func (r *FoundationDBClusterReconciler) getProcessGroupsToRemove(logger logr.Log
continue
}

// ProcessGroup is already marked as excluded we can add it to the processGroupsToRemove and skip further
// checks.
// ProcessGroup is already marked as excluded we can add it to the processGroupsToRemove and skip further checks.
if processGroup.IsExcluded() {
processGroupsToRemove = append(processGroupsToRemove, processGroup)
continue
Expand Down
4 changes: 2 additions & 2 deletions controllers/remove_process_groups_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,9 +167,9 @@ var _ = Describe("remove_process_groups", func() {
}
})

It("should not remove that process group", func() {
It("should not remove the process group and should not exclude processes", func() {
Expect(result).NotTo(BeNil())
Expect(result.message).To(Equal("Removals cannot proceed because cluster has degraded fault tolerance"))
Expect(result.message).To(Equal("Reconciliation needs to exclude more processes"))
// Ensure resources are not deleted
removed, include, err := confirmRemoval(context.Background(), globalControllerLogger, clusterReconciler, cluster, removedProcessGroup.ProcessGroupID)
Expect(err).To(BeNil())
Expand Down
4 changes: 3 additions & 1 deletion controllers/update_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,7 @@ func tryConnectionOptions(logger logr.Logger, cluster *fdbv1beta2.FoundationDBCl
originalConnectionString := cluster.Status.ConnectionString
defer func() { cluster.Status.ConnectionString = originalConnectionString }()

var err error
for _, connectionString := range connectionStrings {
logger.Info("Attempting to get connection string from cluster", "connectionString", connectionString)
cluster.Status.ConnectionString = connectionString
Expand All @@ -309,7 +310,8 @@ func tryConnectionOptions(logger logr.Logger, cluster *fdbv1beta2.FoundationDBCl
return originalConnectionString, clientErr
}

activeConnectionString, err := adminClient.GetConnectionString()
var activeConnectionString string
activeConnectionString, err = adminClient.GetConnectionString()

closeErr := adminClient.Close()
if closeErr != nil {
Expand Down
5 changes: 5 additions & 0 deletions fdbclient/admin_client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -846,6 +846,11 @@ protocol fdb00b071010000`,
Expect(err).NotTo(HaveOccurred())

status := &fdbv1beta2.FoundationDBStatus{
Client: fdbv1beta2.FoundationDBStatusLocalClientInfo{
DatabaseStatus: fdbv1beta2.FoundationDBStatusClientDBStatus{
Available: true,
},
},
Cluster: fdbv1beta2.FoundationDBStatusClusterInfo{
Processes: map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{
"1": { // This process is fully excluded
Expand Down
54 changes: 54 additions & 0 deletions pkg/fdbstatus/status_checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,19 @@ import (
"math"
)

// forbiddenStatusMessages represents messages that could be part of the machine-readable status. Those messages can represent
// different error cases that have occurred when fetching the machine-readable status. A list of possible messages can be
// found here: https://github.com/apple/foundationdb/blob/main/documentation/sphinx/source/mr-status.rst?plain=1#L68-L97
// and here: https://apple.github.io/foundationdb/mr-status.html#message-components.
// We don't want to block the exclusion check for all messages, as some messages also indicate client issues or issues
// with a specific transaction priority.
var forbiddenStatusMessages = map[string]fdbv1beta2.None{
"unreadable_configuration": {},
"full_replication_timeout": {},
"storage_servers_error": {},
"log_servers_error": {},
}

// StatusContextKey will be used as a key in a context to pass down the cached status.
type StatusContextKey struct{}

Expand Down Expand Up @@ -67,6 +80,32 @@ func getRemainingAndExcludedFromStatus(logger logr.Logger, status *fdbv1beta2.Fo
}
}

// If the database is unavailable the status might contain the processes but with missing role information. In this
// case we should not perform any validation on the machine-readable status as this information might not be correct.
// We don't want to run the exclude command to check for the status of the exclusions as this might result in a recovery
// of the cluster or brings the cluster into a worse state.
if !status.Client.DatabaseStatus.Available {
johscheuer marked this conversation as resolved.
Show resolved Hide resolved
logger.Info("Skipping exclusion check as the database is unavailable")
return exclusionStatus{
inProgress: nil,
fullyExcluded: nil,
notExcluded: addresses,
missingInStatus: nil,
}
}

// We have to make sure that the provided machine-readable status contains the required information, if any of the
// forbiddenStatusMessages is present, the operator is not able to make a decision if a set of processes is fully excluded
// or not.
if !clusterStatusHasValidRoleInformation(logger, status) {
return exclusionStatus{
inProgress: nil,
fullyExcluded: nil,
notExcluded: addresses,
missingInStatus: nil,
}
}

addressesToVerify := map[string]fdbv1beta2.None{}
for _, addr := range addresses {
addressesToVerify[addr.MachineAddress()] = fdbv1beta2.None{}
Expand Down Expand Up @@ -133,6 +172,21 @@ func getRemainingAndExcludedFromStatus(logger logr.Logger, status *fdbv1beta2.Fo
return exclusions
}

// clusterStatusHasValidRoleInformation will check if the cluster part of the machine-readable status contains messages
// that indicate that not all role information could be fetched.
func clusterStatusHasValidRoleInformation(logger logr.Logger, status *fdbv1beta2.FoundationDBStatus) bool {
for _, message := range status.Cluster.Messages {
if _, ok := forbiddenStatusMessages[message.Name]; ok {
logger.Info("Skipping exclusion check as the machine-readable status includes a message that indicates an potential incomplete status",
"messages", status.Cluster.Messages,
"forbiddenStatusMessages", forbiddenStatusMessages)
return false
}
}

return true
}

// CanSafelyRemoveFromStatus checks whether it is safe to remove processes from the cluster, based on the provided status.
//
// The list returned by this method will be the addresses that are *not* safe to remove.
Expand Down
Loading
Loading