Skip to content
This repository has been archived by the owner on Sep 30, 2024. It is now read-only.

Failure analysis contains command hint, advertised to hooks #442

Merged
merged 1 commit into from
Mar 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions go/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,10 +220,10 @@ type Configuration struct {
RecoverMasterClusterFilters []string // Only do master recovery on clusters matching these regexp patterns (of course the ".*" pattern matches everything)
RecoverIntermediateMasterClusterFilters []string // Only do IM recovery on clusters matching these regexp patterns (of course the ".*" pattern matches everything)
ProcessesShellCommand string // Shell that executes command scripts
OnFailureDetectionProcesses []string // Processes to execute when detecting a failover scenario (before making a decision whether to failover or not). May and should use some of these placeholders: {failureType}, {failureDescription}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {autoMasterRecovery}, {autoIntermediateMasterRecovery}
PreFailoverProcesses []string // Processes to execute before doing a failover (aborting operation should any one of them exit with non-zero code; order of execution undefined). May and should use some of these placeholders: {failureType}, {failureDescription}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}
PostFailoverProcesses []string // Processes to execute after doing a failover (order of execution undefined). May and should use some of these placeholders: {failureType}, {failureDescription}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {isSuccessful}, {lostReplicas}
PostUnsuccessfulFailoverProcesses []string // Processes to execute after a not-completely-successful failover (order of execution undefined). May and should use some of these placeholders: {failureType}, {failureDescription}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {isSuccessful}, {lostReplicas}
OnFailureDetectionProcesses []string // Processes to execute when detecting a failover scenario (before making a decision whether to failover or not). May and should use some of these placeholders: {failureType}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {autoMasterRecovery}, {autoIntermediateMasterRecovery}
PreFailoverProcesses []string // Processes to execute before doing a failover (aborting operation should any one of them exit with non-zero code; order of execution undefined). May and should use some of these placeholders: {failureType}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}
PostFailoverProcesses []string // Processes to execute after doing a failover (order of execution undefined). May and should use some of these placeholders: {failureType}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {isSuccessful}, {lostReplicas}
PostUnsuccessfulFailoverProcesses []string // Processes to execute after a not-completely-successful failover (order of execution undefined). May and should use some of these placeholders: {failureType}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {isSuccessful}, {lostReplicas}
PostMasterFailoverProcesses []string // Processes to execute after doing a master failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses
PostIntermediateMasterFailoverProcesses []string // Processes to execute after doing an intermediate master failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses
UnreachableMasterWithStaleSlavesProcesses []string // Processes to execute when detecting an UnreachableMasterWithStaleSlaves scenario.
Expand Down
1 change: 1 addition & 0 deletions go/inst/analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ type ReplicationAnalysis struct {
GTIDMode string
MinReplicaGTIDMode string
MaxReplicaGTIDMode string
CommandHint string
}

// AnalysisMap maps a string key to a pointer to its ReplicationAnalysis entry.
// NOTE(review): the meaning of the key is not visible here — confirm against callers.
type AnalysisMap map[string](*ReplicationAnalysis)
Expand Down
11 changes: 7 additions & 4 deletions go/logic/topology_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ func replaceCommandPlaceholders(command string, topologyRecovery *TopologyRecove
analysisEntry := &topologyRecovery.AnalysisEntry
command = strings.Replace(command, "{failureType}", string(analysisEntry.Analysis), -1)
command = strings.Replace(command, "{failureDescription}", analysisEntry.Description, -1)
command = strings.Replace(command, "{command}", analysisEntry.CommandHint, -1)
command = strings.Replace(command, "{failedHost}", analysisEntry.AnalyzedInstanceKey.Hostname, -1)
command = strings.Replace(command, "{failedPort}", fmt.Sprintf("%d", analysisEntry.AnalyzedInstanceKey.Port), -1)
command = strings.Replace(command, "{failureCluster}", analysisEntry.ClusterDetails.ClusterName, -1)
Expand Down Expand Up @@ -287,6 +288,7 @@ func applyEnvironmentVariables(topologyRecovery *TopologyRecovery) []string {
env := goos.Environ()
env = append(env, fmt.Sprintf("ORC_FAILURE_TYPE=%s", string(analysisEntry.Analysis)))
env = append(env, fmt.Sprintf("ORC_FAILURE_DESCRIPTION=%s", analysisEntry.Description))
env = append(env, fmt.Sprintf("ORC_COMMAND=%s", analysisEntry.CommandHint))
env = append(env, fmt.Sprintf("ORC_FAILED_HOST=%s", analysisEntry.AnalyzedInstanceKey.Hostname))
env = append(env, fmt.Sprintf("ORC_FAILED_PORT=%d", analysisEntry.AnalyzedInstanceKey.Port))
env = append(env, fmt.Sprintf("ORC_FAILURE_CLUSTER=%s", analysisEntry.ClusterDetails.ClusterName))
Expand Down Expand Up @@ -1524,7 +1526,7 @@ func CheckAndRecover(specificInstance *inst.InstanceKey, candidateInstanceKey *i
// ForceExecuteRecovery can be called to issue a recovery process even if analysis says there is no recovery case.
// The caller of this function injects the type of analysis it wishes the function to assume.
// By calling this function one takes responsibility for one's actions.
func ForceExecuteRecovery(clusterName string, analysisCode inst.AnalysisCode, failedInstanceKey *inst.InstanceKey, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) {
func ForceExecuteRecovery(clusterName string, analysisCode inst.AnalysisCode, commandHint string, failedInstanceKey *inst.InstanceKey, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) {
clusterInfo, err := inst.ReadClusterInfo(clusterName)
if err != nil {
return recoveryAttempted, topologyRecovery, err
Expand All @@ -1543,6 +1545,7 @@ func ForceExecuteRecovery(clusterName string, analysisCode inst.AnalysisCode, fa
}
}
analysisEntry.Analysis = analysisCode // we force this analysis
analysisEntry.CommandHint = commandHint
analysisEntry.ClusterDetails = *clusterInfo
analysisEntry.AnalyzedInstanceKey = *failedInstanceKey

Expand All @@ -1560,7 +1563,7 @@ func ForceMasterFailover(clusterName string) (topologyRecovery *TopologyRecovery
}
clusterMaster := clusterMasters[0]

recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(clusterName, inst.DeadMaster, &clusterMaster.Key, nil, false)
recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(clusterName, inst.DeadMaster, "force-master-failover", &clusterMaster.Key, nil, false)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -1593,7 +1596,7 @@ func ForceMasterTakeover(clusterName string, destination *inst.Instance) (topolo
}
log.Infof("Will demote %+v and promote %+v instead", clusterMaster.Key, destination.Key)

recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(clusterName, inst.DeadMaster, &clusterMaster.Key, &destination.Key, false)
recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(clusterName, inst.DeadMaster, "force-master-takeover", &clusterMaster.Key, &destination.Key, false)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -1670,7 +1673,7 @@ func GracefulMasterTakeover(clusterName string) (topologyRecovery *TopologyRecov
}
promotedMasterCoordinates = &designatedInstance.SelfBinlogCoordinates

recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(clusterName, inst.DeadMaster, &clusterMaster.Key, &designatedInstance.Key, false)
recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(clusterName, inst.DeadMaster, "graceful-master-takeover", &clusterMaster.Key, &designatedInstance.Key, false)
if err != nil {
return nil, nil, err
}
Expand Down