Skip to content
This repository has been archived by the owner on Sep 30, 2024. It is now read-only.

Commit

Permalink
Merge pull request #199 from github/force-master-recovery
Browse files Browse the repository at this point in the history
support for force-master-failover command
  • Loading branch information
Shlomi Noach authored Jun 11, 2017
2 parents eddec23 + dde72b3 commit d1ec06f
Show file tree
Hide file tree
Showing 5 changed files with 97 additions and 26 deletions.
9 changes: 9 additions & 0 deletions go/app/cli.go
Original file line number Diff line number Diff line change
Expand Up @@ -1384,6 +1384,15 @@ func Cli(command string, strict bool, instance string, destination string, owner
fmt.Println(promotedInstanceKey.DisplayString())
}
}
case registerCliCommand("force-master-failover", "Recovery", `Forcibly discard master and initiate a failover, even if orchestrator doesn't see a problem. This command lets orchestrator choose the replacement master`):
{
clusterName := getClusterName(clusterAlias, instanceKey)
topologyRecovery, err := logic.ForceMasterFailover(clusterName)
if err != nil {
log.Fatale(err)
}
fmt.Println(topologyRecovery.SuccessorKey.DisplayString())
}
case registerCliCommand("force-master-takeover", "Recovery", `Forcibly discard master and promote another (direct child) instance instead, even if everything is running well`):
{
clusterName := getClusterName(clusterAlias, instanceKey)
Expand Down
10 changes: 9 additions & 1 deletion go/app/prompt.go
Original file line number Diff line number Diff line change
Expand Up @@ -624,7 +624,7 @@ Cheatsheet:
which-lost-in-recovery
List instances marked as downtimed for being lost in a recovery process. The output of this command lists
"lost" instances that probabaly should be recycled.
"lost" instances that probabaly should be recycled.
The topology recovery process injects a magic hint when downtiming lost instances, that is picked up
by this command. Examples:
Expand Down Expand Up @@ -771,6 +771,14 @@ Cheatsheet:
orchestrator -c recover-lite -i dead.instance.com --debug
force-master-failover
Forcibly begin a master failover process, even if orchestrator does not see anything wrong
in particular with the master.
- This will not work in a master-master configuration
- Orchestrator just treats this command as a DeadMaster failover scenario
- Orchestrator will issue all relevant pre-failover and post-failover external processes.
- Orchestrator will not attempt to recover/reconnect the old master
force-master-takeover
Forcibly discard master and promote another (direct child) instance instead, even if everything is running well.
This allows for planned switchover.
Expand Down
32 changes: 32 additions & 0 deletions go/http/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -2186,6 +2186,37 @@ func (this *HttpAPI) Recover(params martini.Params, r render.Render, req *http.R
}
}

// ForceMasterFailover fails over a master (even if there's no particular problem with the master).
//
// The cluster is resolved from the instance identified by the "host" and "port"
// route parameters; the failover itself is delegated to logic.ForceMasterFailover.
//
// Every outcome is rendered as an HTTP 200 JSON APIResponse: failures carry
// Code: ERROR plus a message, success carries Code: OK with the recovery
// record in Details.
func (this *HttpAPI) ForceMasterFailover(params martini.Params, r render.Render, req *http.Request, user auth.User) {
	if !isAuthorizedForAction(req, user) {
		r.JSON(200, &APIResponse{Code: ERROR, Message: "Unauthorized"})
		return
	}
	instanceKey, err := this.getInstanceKey(params["host"], params["port"])
	if err != nil {
		r.JSON(200, &APIResponse{Code: ERROR, Message: err.Error()})
		return
	}
	instance, found, err := inst.ReadInstance(&instanceKey)
	if (!found) || (err != nil) {
		r.JSON(200, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot read instance: %+v", instanceKey)})
		return
	}

	topologyRecovery, err := logic.ForceMasterFailover(instance.ClusterName)
	if err != nil {
		r.JSON(200, &APIResponse{Code: ERROR, Message: err.Error()})
		return
	}

	// Nothing is printed to stdout here: the result belongs in the JSON
	// response, and SuccessorKey must not be dereferenced before the nil
	// check below.
	if topologyRecovery.SuccessorKey != nil {
		r.JSON(200, &APIResponse{Code: OK, Message: "Master failed over", Details: topologyRecovery})
	} else {
		r.JSON(200, &APIResponse{Code: OK, Message: "Master not failed over", Details: topologyRecovery})
	}
}

// Registers promotion preference for given instance
func (this *HttpAPI) RegisterCandidate(params martini.Params, r render.Render, req *http.Request, user auth.User) {
if !isAuthorizedForAction(req, user) {
Expand Down Expand Up @@ -2574,6 +2605,7 @@ func (this *HttpAPI) RegisterRequests(m *martini.ClassicMartini) {
this.registerRequest(m, "recover/:host/:port/:candidateHost/:candidatePort", this.Recover)
this.registerRequest(m, "recover-lite/:host/:port", this.RecoverLite)
this.registerRequest(m, "recover-lite/:host/:port/:candidateHost/:candidatePort", this.RecoverLite)
this.registerRequest(m, "force-master-failover/:host/:port", this.ForceMasterFailover)
this.registerRequest(m, "register-candidate/:host/:port/:promotionRule", this.RegisterCandidate)
this.registerRequest(m, "automated-recovery-filters", this.AutomatedRecoveryFilters)
this.registerRequest(m, "audit-failure-detection", this.AuditFailureDetection)
Expand Down
27 changes: 27 additions & 0 deletions go/logic/topology_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -1425,6 +1425,33 @@ func ForceExecuteRecovery(clusterName string, analysisCode inst.AnalysisCode, fa
return executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, true, skipProcesses)
}

// ForceMasterFailover *trusts* master of given cluster is dead and initiates a failover.
//
// The cluster must resolve to exactly one master via
// inst.ReadClusterWriteableMaster; otherwise an error is returned and no
// recovery is attempted. The recovery is run as a forced inst.DeadMaster
// analysis against that master.
//
// On success the returned TopologyRecovery is non-nil and has a non-nil
// SuccessorKey. On any failure the returned TopologyRecovery is nil.
func ForceMasterFailover(clusterName string) (topologyRecovery *TopologyRecovery, err error) {
	clusterMasters, err := inst.ReadClusterWriteableMaster(clusterName)
	if err != nil {
		// Keep the underlying cause so a backend/read failure can be told
		// apart from an ambiguous topology.
		return nil, fmt.Errorf("Cannot deduce cluster master for %+v: %+v", clusterName, err)
	}
	if len(clusterMasters) != 1 {
		return nil, fmt.Errorf("Cannot deduce cluster master for %+v: expected exactly 1 master, found %d", clusterName, len(clusterMasters))
	}
	clusterMaster := clusterMasters[0]

	// NOTE(review): the nil argument appears to be the candidate instance key
	// (i.e. no preferred successor) and false appears to be skipProcesses —
	// confirm against ForceExecuteRecovery's signature.
	recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(clusterName, inst.DeadMaster, &clusterMaster.Key, nil, false)
	if err != nil {
		return nil, err
	}
	if !recoveryAttempted {
		return nil, fmt.Errorf("Unexpected error: recovery not attempted. This should not happen")
	}
	if topologyRecovery == nil {
		return nil, fmt.Errorf("Recovery attempted but with no results. This should not happen")
	}
	if topologyRecovery.SuccessorKey == nil {
		return nil, fmt.Errorf("Recovery attempted yet no replica promoted")
	}
	return topologyRecovery, nil
}

// ForceMasterTakeover *trusts* master of given cluster is dead and fails over to designated instance,
// which has to be its direct child.
func ForceMasterTakeover(clusterName string, destination *inst.Instance) (topologyRecovery *TopologyRecovery, err error) {
Expand Down
45 changes: 20 additions & 25 deletions resources/public/js/cluster.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,20 @@ function Cluster() {
apiCommand("/api/recover-lite/" + _instancesMap[e.draggedNodeId].Key.Hostname + "/" + _instancesMap[e.draggedNodeId].Key.Port);
return true;
},
"match-up-replicas": function(e) {
apiCommand("/api/match-up-replicas/" + _instancesMap[e.draggedNodeId].Key.Hostname + "/" + _instancesMap[e.draggedNodeId].Key.Port);
return true;
},
"regroup-replicas": function(e) {
apiCommand("/api/regroup-replicas/" + _instancesMap[e.draggedNodeId].Key.Hostname + "/" + _instancesMap[e.draggedNodeId].Key.Port);
"force-master-failover": function(e) {
apiCommand("/api/force-master-failover/" + _instancesMap[e.draggedNodeId].Key.Hostname + "/" + _instancesMap[e.draggedNodeId].Key.Port);
return true;
},
"recover-suggested-successor": function(e) {
var suggestedSuccessorHost = $(e.target).attr("data-suggested-successor-host");
var suggestedSuccessorPort = $(e.target).attr("data-suggested-successor-port");
var suggestedSuccessorHost = $(e.target).attr("data-successor-host");
var suggestedSuccessorPort = $(e.target).attr("data-successor-port");
apiCommand("/api/recover/" + _instancesMap[e.draggedNodeId].Key.Hostname + "/" + _instancesMap[e.draggedNodeId].Key.Port + "/" + suggestedSuccessorHost + "/" + suggestedSuccessorPort);
return true;
},
"match-replicas": function(e) {
var belowHost = $(e.target).attr("data-below-host");
var belowPort = $(e.target).attr("data-below-port");
apiCommand("/api/match-replicas/" + _instancesMap[e.draggedNodeId].Key.Hostname + "/" + _instancesMap[e.draggedNodeId].Key.Port + "/" + belowHost + "/" + belowPort);
"relocate-replicas": function(e) {
var belowHost = $(e.target).attr("data-successor-host");
var belowPort = $(e.target).attr("data-successor-port");
apiCommand("/api/relocate-replicas/" + _instancesMap[e.draggedNodeId].Key.Hostname + "/" + _instancesMap[e.draggedNodeId].Key.Port + "/" + belowHost + "/" + belowPort);
return true;
},
"make-master": function(e) {
Expand Down Expand Up @@ -281,8 +277,9 @@ function Cluster() {
openNodeModal(_instancesMap[draggedNodeId]);
return false;
});

$("body").on("click", ".instance a[data-command], .instance button[data-command]", function(e) {
var target = $(e.target);
var target = $(e.target).closest("a");
var instanceEl = target.closest(".instance");
e.draggedNodeId = instanceEl.attr("data-nodeid");

Expand Down Expand Up @@ -1362,17 +1359,11 @@ function Cluster() {
popoverElement.find(".popover-footer .dropdown").append('<button type="button" class="btn btn-xs btn-default dropdown-toggle" id="recover_dropdown_' + instance.id + '" data-toggle="dropdown" aria-haspopup="true" aria-expanded="true"><span class="glyphicon glyphicon-heart text-danger"></span> Recover <span class="caret"></span></button><ul class="dropdown-menu" aria-labelledby="recover_dropdown_' + instance.id + '"></ul>');
popoverElement.find(".popover-footer .dropdown").append('<ul class="dropdown-menu" aria-labelledby="recover_dropdown_' + instance.id + '"></ul>');
var recoveryListing = popoverElement.find(".dropdown ul");
recoveryListing.append('<li><a href="#" data-btn="auto" data-command="recover-auto">Auto (implies running external hooks/processes)</a></li>');
recoveryListing.append('<li><a href="#" data-btn="auto-lite" data-command="recover-auto-lite">Auto (do not execute hooks/processes)</a></li>');
recoveryListing.append('<li role="separator" class="divider"></li>');

if (!instance.isMaster) {
recoveryListing.append('<li><a href="#" data-btn="match-up-replicas" data-command="match-up-replicas">Match up replicas to <code>' + instance.masterTitle + '</code></a></li>');
}
if (instance.children && instance.children.length > 1) {
recoveryListing.append('<li><a href="#" data-btn="regroup-replicas" data-command="regroup-replicas">Regroup replicas (auto pick best replica, only heals topology, no external processes)</a></li>');
}
if (instance.isMaster) {
recoveryListing.append('<li><a href="#" data-btn="force-master-failover" data-command="force-master-failover"><div class="glyphicon glyphicon-exclamation-sign text-danger"></div> <span class="text-danger">Force fail over <strong>now</strong> (even if normal handling would not fail over)</span></a></li>');
recoveryListing.append('<li role="separator" class="divider"></li>');

// Suggest successor
instance.children.forEach(function(replica) {
if (!replica.LogBinEnabled) {
Expand All @@ -1391,9 +1382,14 @@ function Cluster() {
return
}
recoveryListing.append(
'<li><a href="#" data-btn="recover-suggested-successor" data-command="recover-suggested-successor" data-suggested-successor-host="' + replica.Key.Hostname + '" data-suggested-successor-port="' + replica.Key.Port + '">Regroup replicas, try to promote <code>' + replica.title + '</code></a></li>');
'<li><a href="#" data-btn="recover-suggested-successor" data-command="recover-suggested-successor" data-successor-host="' + replica.Key.Hostname + '" data-successor-port="' + replica.Key.Port + '">Recover, try to promote <code>' + replica.title + '</code></a></li>');
});
}
if (!instance.isMaster) {
recoveryListing.append('<li><a href="#" data-btn="auto" data-command="recover-auto">Auto (implies running external hooks/processes)</a></li>');
recoveryListing.append('<li role="separator" class="divider"></li>');
recoveryListing.append('<li><a href="#" data-btn="relocate-replicas" data-command="relocate-replicas" data-successor-host="' + instance.MasterKey.Hostname + '" data-successor-port="' + instance.MasterKey.Port + '">Relocate replicas to <code>' + instance.masterTitle + '</code></a></li>');
}
if (instance.masterNode) {
// Intermediate master; suggest successor
instance.masterNode.children.forEach(function(sibling) {
Expand All @@ -1413,12 +1409,11 @@ function Cluster() {
return
}
recoveryListing.append(
'<li><a href="#" data-btn="match-replicas" data-command="match-replicas" data-below-host="' + sibling.Key.Hostname + '" data-below-port="' + sibling.Key.Port + '">Match all replicas below <code>' + sibling.title + '</code></a></li>');
'<li><a href="#" data-btn="relocate-replicas" data-command="relocate-replicas" data-successor-host="' + sibling.Key.Hostname + '" data-successor-port="' + sibling.Key.Port + '">Relocate replicas to <code>' + sibling.title + '</code></a></li>');
});
}
}


function reviewReplicationAnalysis(replicationAnalysis) {
var instancesMap = _instancesMap;
var clusterHasReplicationAnalysisIssue = false;
Expand Down

0 comments on commit d1ec06f

Please sign in to comment.