From 13191fa15cba13cda72b0ddcc0496e6d85e185de Mon Sep 17 00:00:00 2001
From: Shlomi Noach
Date: Wed, 7 Jun 2017 16:58:49 +0300
Subject: [PATCH 1/6] support for force-master-failover command

---
 go/app/cli.go                 |  9 +++++++++
 go/logic/topology_recovery.go | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/go/app/cli.go b/go/app/cli.go
index 5f6037ee4..755b37b8d 100644
--- a/go/app/cli.go
+++ b/go/app/cli.go
@@ -1384,6 +1384,15 @@ func Cli(command string, strict bool, instance string, destination string, owner
 				fmt.Println(promotedInstanceKey.DisplayString())
 			}
 		}
+	case registerCliCommand("force-master-failover", "Recovery", `Forcibly discard master and initiate a failover, even if orchestrator doesn't see a problem. This command lets orchestrator choose the replacement master`):
+		{
+			clusterName := getClusterName(clusterAlias, instanceKey)
+			topologyRecovery, err := logic.ForceMasterFailover(clusterName)
+			if err != nil {
+				log.Fatale(err)
+			}
+			fmt.Println(topologyRecovery.SuccessorKey.DisplayString())
+		}
 	case registerCliCommand("force-master-takeover", "Recovery", `Forcibly discard master and promote another (direct child) instance instead, even if everything is running well`):
 		{
 			clusterName := getClusterName(clusterAlias, instanceKey)
diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go
index 7737e5810..ffd6d38f9 100644
--- a/go/logic/topology_recovery.go
+++ b/go/logic/topology_recovery.go
@@ -1403,6 +1403,38 @@ func ForceExecuteRecovery(clusterName string, analysisCode inst.AnalysisCode, fa
 	return executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, true, skipProcesses)
 }
 
+// ForceMasterFailover *trusts* master of given cluster is dead and initiates a failover,
+// letting orchestrator choose the replacement master.
+// It returns an error when the cluster's single writeable master cannot be deduced, or when
+// the recovery was not attempted, yielded no results, or promoted no replica.
+func ForceMasterFailover(clusterName string) (topologyRecovery *TopologyRecovery, err error) {
+	clusterMasters, err := inst.ReadClusterWriteableMaster(clusterName)
+	if err != nil {
+		// Keep the underlying cause: a failed read must be distinguishable from a topology problem
+		return nil, fmt.Errorf("Cannot deduce cluster master for %+v: %+v", clusterName, err)
+	}
+	if len(clusterMasters) != 1 {
+		return nil, fmt.Errorf("Cannot deduce cluster master for %+v: found %d writeable masters", clusterName, len(clusterMasters))
+	}
+	clusterMaster := clusterMasters[0]
+
+	// Run this as a DeadMaster recovery on the master; nil, false presumably mean "no candidate" and "do not skip processes"
+	recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(clusterName, inst.DeadMaster, &clusterMaster.Key, nil, false)
+	if err != nil {
+		return nil, err
+	}
+	if !recoveryAttempted {
+		return nil, fmt.Errorf("Unexpected error: recovery not attempted. This should not happen")
+	}
+	if topologyRecovery == nil {
+		return nil, fmt.Errorf("Recovery attempted but with no results. This should not happen")
+	}
+	if topologyRecovery.SuccessorKey == nil {
+		return nil, fmt.Errorf("Recovery attempted yet no replica promoted")
+	}
+	return topologyRecovery, nil
+}
+
 // ForceMasterTakeover *trusts* master of given cluster is dead and fails over to designated instance,
 // which has to be its direct child.
 func ForceMasterTakeover(clusterName string, destination *inst.Instance) (topologyRecovery *TopologyRecovery, err error) {
From 20ec58910aa14fa8142d9f4403f73eea4628dd51 Mon Sep 17 00:00:00 2001
From: Shlomi Noach
Date: Thu, 8 Jun 2017 09:12:19 +0300
Subject: [PATCH 2/6] /api/force-master-failover added

---
 go/http/api.go | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/go/http/api.go b/go/http/api.go
index 83f504d84..14d62c0fa 100644
--- a/go/http/api.go
+++ b/go/http/api.go
@@ -2186,6 +2186,40 @@ func (this *HttpAPI) Recover(params martini.Params, r render.Render, req *http.R
 	}
 }
 
+// ForceMasterFailover fails over a master (even if there's no particular problem with the master).
+// The :host/:port parameters identify an instance; the failover is issued against the cluster
+// that instance belongs to, and orchestrator chooses the replacement master.
+// Every outcome is sent with HTTP status 200; success or failure is conveyed by APIResponse.Code.
+func (this *HttpAPI) ForceMasterFailover(params martini.Params, r render.Render, req *http.Request, user auth.User) {
+	if !isAuthorizedForAction(req, user) {
+		r.JSON(200, &APIResponse{Code: ERROR, Message: "Unauthorized"})
+		return
+	}
+	instanceKey, err := this.getInstanceKey(params["host"], params["port"])
+	if err != nil {
+		r.JSON(200, &APIResponse{Code: ERROR, Message: err.Error()})
+		return
+	}
+	instance, found, err := inst.ReadInstance(&instanceKey)
+	if (!found) || (err != nil) {
+		r.JSON(200, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot read instance: %+v", instanceKey)})
+		return
+	}
+
+	topologyRecovery, err := logic.ForceMasterFailover(instance.ClusterName)
+	if err != nil {
+		r.JSON(200, &APIResponse{Code: ERROR, Message: err.Error()})
+		return
+	}
+
+	if topologyRecovery.SuccessorKey != nil {
+		r.JSON(200, &APIResponse{Code: OK, Message: "Master failed over", Details: topologyRecovery})
+	} else {
+		// Defensive: a recovery that promoted no successor must not be reported as a success
+		r.JSON(200, &APIResponse{Code: ERROR, Message: "Master not failed over", Details: topologyRecovery})
+	}
+}
+
 // Registers promotion preference for given instance
 func (this *HttpAPI) RegisterCandidate(params martini.Params, r render.Render, req *http.Request, user auth.User) {
 	if !isAuthorizedForAction(req, user) {
@@ -2574,6 +2608,7 @@ func (this *HttpAPI) RegisterRequests(m *martini.ClassicMartini) {
 	this.registerRequest(m, "recover/:host/:port/:candidateHost/:candidatePort", this.Recover)
 	this.registerRequest(m, "recover-lite/:host/:port", this.RecoverLite)
 	this.registerRequest(m, "recover-lite/:host/:port/:candidateHost/:candidatePort", this.RecoverLite)
+	this.registerRequest(m, "force-master-failover/:host/:port", this.ForceMasterFailover)
 	this.registerRequest(m, "register-candidate/:host/:port/:promotionRule", this.RegisterCandidate)
 	this.registerRequest(m, "automated-recovery-filters", this.AutomatedRecoveryFilters)
 	this.registerRequest(m, "audit-failure-detection", this.AuditFailureDetection)
From 58e8922d42d9026270fcc933901140b7f4e8bcf0 Mon Sep 17 00:00:00 2001
From: Shlomi Noach
Date: Thu, 8 Jun 2017 09:12:30 +0300
Subject: [PATCH 3/6] added web interface for failing over master

---
 resources/public/js/cluster.js | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/resources/public/js/cluster.js b/resources/public/js/cluster.js
index 
ba6d938bc..6c32967b3 100644 --- a/resources/public/js/cluster.js +++ b/resources/public/js/cluster.js @@ -23,6 +23,10 @@ function Cluster() { apiCommand("/api/recover-lite/" + _instancesMap[e.draggedNodeId].Key.Hostname + "/" + _instancesMap[e.draggedNodeId].Key.Port); return true; }, + "force-master-failover": function(e) { + apiCommand("/api/force-master-failover/" + _instancesMap[e.draggedNodeId].Key.Hostname + "/" + _instancesMap[e.draggedNodeId].Key.Port); + return true; + }, "match-up-replicas": function(e) { apiCommand("/api/match-up-replicas/" + _instancesMap[e.draggedNodeId].Key.Hostname + "/" + _instancesMap[e.draggedNodeId].Key.Port); return true; @@ -1366,6 +1370,10 @@ function Cluster() { recoveryListing.append('
  • Auto (do not execute hooks/processes)
  • '); recoveryListing.append(''); + if (instance.isMaster) { + recoveryListing.append('
  • Force fail over now (even if normal handling would not fail over)
  • '); + recoveryListing.append(''); + } if (!instance.isMaster) { recoveryListing.append('
  • Match up replicas to ' + instance.masterTitle + '
  • '); } From 0970018991f23b6148d1941f50fc0a53289dd65c Mon Sep 17 00:00:00 2001 From: Shlomi Noach Date: Thu, 8 Jun 2017 10:45:47 +0300 Subject: [PATCH 4/6] fixing listener on recover listing --- resources/public/js/cluster.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/resources/public/js/cluster.js b/resources/public/js/cluster.js index 6c32967b3..9f5cfa419 100644 --- a/resources/public/js/cluster.js +++ b/resources/public/js/cluster.js @@ -285,8 +285,9 @@ function Cluster() { openNodeModal(_instancesMap[draggedNodeId]); return false; }); + $("body").on("click", ".instance a[data-command], .instance button[data-command]", function(e) { - var target = $(e.target); + var target = $(e.currentTarget); var instanceEl = target.closest(".instance"); e.draggedNodeId = instanceEl.attr("data-nodeid"); From 34ee413229c869a0c6afff4b2ce57a0b8e4382c9 Mon Sep 17 00:00:00 2001 From: Shlomi Noach Date: Thu, 8 Jun 2017 16:43:09 +0300 Subject: [PATCH 5/6] documenting force-master-failover --- go/app/prompt.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/go/app/prompt.go b/go/app/prompt.go index 070a22e17..05506a229 100644 --- a/go/app/prompt.go +++ b/go/app/prompt.go @@ -624,7 +624,7 @@ Cheatsheet: which-lost-in-recovery List instances marked as downtimed for being lost in a recovery process. The output of this command lists - "lost" instances that probabaly should be recycled. + "lost" instances that probably should be recycled. The topology recovery process injects a magic hint when downtiming lost instances, that is picked up by this command. Examples: @@ -771,6 +771,14 @@ Cheatsheet: orchestrator -c recover-lite -i dead.instance.com --debug + force-master-failover + Forcibly begin a master failover process, even if orchestrator does not see anything wrong + in particular with the master. 
+ - This will not work in a master-master configuration + - Orchestrator just treats this command as a DeadMaster failover scenario + - Orchestrator will issue all relevant pre-failover and post-failover external processes. + - Orchestrator will not attempt to recover/reconnect the old master + force-master-takeover Forcibly discard master and promote another (direct child) instance instead, even if everything is running well. This allows for planned switchover. From 99c2bc5b6624e582858af691a6bf391c579f66d7 Mon Sep 17 00:00:00 2001 From: Shlomi Noach Date: Thu, 8 Jun 2017 21:47:05 +0300 Subject: [PATCH 6/6] simplified recovery options on 'recover' button. For masters: just fail over, or attempt to fail over onto specific instances. No more 'lite' recoveries. People really just want to fail over. For intermediate masters: 'relocate-replicas' instead of match* commands; no 'lite' option --- resources/public/js/cluster.js | 42 ++++++++++++---------------------- 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/resources/public/js/cluster.js b/resources/public/js/cluster.js index 9f5cfa419..9f2ba3e7b 100644 --- a/resources/public/js/cluster.js +++ b/resources/public/js/cluster.js @@ -27,24 +27,16 @@ function Cluster() { apiCommand("/api/force-master-failover/" + _instancesMap[e.draggedNodeId].Key.Hostname + "/" + _instancesMap[e.draggedNodeId].Key.Port); return true; }, - "match-up-replicas": function(e) { - apiCommand("/api/match-up-replicas/" + _instancesMap[e.draggedNodeId].Key.Hostname + "/" + _instancesMap[e.draggedNodeId].Key.Port); - return true; - }, - "regroup-replicas": function(e) { - apiCommand("/api/regroup-replicas/" + _instancesMap[e.draggedNodeId].Key.Hostname + "/" + _instancesMap[e.draggedNodeId].Key.Port); - return true; - }, "recover-suggested-successor": function(e) { - var suggestedSuccessorHost = $(e.target).attr("data-suggested-successor-host"); - var suggestedSuccessorPort = $(e.target).attr("data-suggested-successor-port"); + var 
suggestedSuccessorHost = $(e.target).closest("[data-successor-host]").attr("data-successor-host"); + var suggestedSuccessorPort = $(e.target).closest("[data-successor-port]").attr("data-successor-port"); apiCommand("/api/recover/" + _instancesMap[e.draggedNodeId].Key.Hostname + "/" + _instancesMap[e.draggedNodeId].Key.Port + "/" + suggestedSuccessorHost + "/" + suggestedSuccessorPort); return true; }, - "match-replicas": function(e) { - var belowHost = $(e.target).attr("data-below-host"); - var belowPort = $(e.target).attr("data-below-port"); - apiCommand("/api/match-replicas/" + _instancesMap[e.draggedNodeId].Key.Hostname + "/" + _instancesMap[e.draggedNodeId].Key.Port + "/" + belowHost + "/" + belowPort); + "relocate-replicas": function(e) { + var belowHost = $(e.target).closest("[data-successor-host]").attr("data-successor-host"); + var belowPort = $(e.target).closest("[data-successor-port]").attr("data-successor-port"); + apiCommand("/api/relocate-replicas/" + _instancesMap[e.draggedNodeId].Key.Hostname + "/" + _instancesMap[e.draggedNodeId].Key.Port + "/" + belowHost + "/" + belowPort); return true; }, "make-master": function(e) { @@ -1367,21 +1359,11 @@ function Cluster() { popoverElement.find(".popover-footer .dropdown").append(''); popoverElement.find(".popover-footer .dropdown").append(''); var recoveryListing = popoverElement.find(".dropdown ul"); - recoveryListing.append('
  • Auto (implies running external hooks/processes)
  • '); - recoveryListing.append('
  • Auto (do not execute hooks/processes)
  • '); - recoveryListing.append(''); if (instance.isMaster) { recoveryListing.append('
  • Force fail over now (even if normal handling would not fail over)
  • '); recoveryListing.append(''); - } - if (!instance.isMaster) { - recoveryListing.append('
  • Match up replicas to ' + instance.masterTitle + '
  • '); - } - if (instance.children && instance.children.length > 1) { - recoveryListing.append('
  • Regroup replicas (auto pick best replica, only heals topology, no external processes)
  • '); - } - if (instance.isMaster) { + // Suggest successor instance.children.forEach(function(replica) { if (!replica.LogBinEnabled) { @@ -1400,9 +1382,14 @@ function Cluster() { return } recoveryListing.append( - '
  • Regroup replicas, try to promote ' + replica.title + '
  • '); + '
  • Recover, try to promote ' + replica.title + '
  • '); }); } + if (!instance.isMaster) { + recoveryListing.append('
  • Auto (implies running external hooks/processes)
  • '); + recoveryListing.append(''); + recoveryListing.append('
  • Relocate replicas to ' + instance.masterTitle + '
  • '); + } if (instance.masterNode) { // Intermediate master; suggest successor instance.masterNode.children.forEach(function(sibling) { @@ -1422,12 +1409,11 @@ function Cluster() { return } recoveryListing.append( - '
  • Match all replicas below ' + sibling.title + '
  • '); + '
  • Relocate replicas to ' + sibling.title + '
  • '); }); } } - function reviewReplicationAnalysis(replicationAnalysis) { var instancesMap = _instancesMap; var clusterHasReplicationAnalysisIssue = false;